mv_inference: Move OpenCV-dependent code out of inference-engine-vision
authorInki Dae <inki.dae@samsung.com>
Wed, 12 Feb 2020 02:35:21 +0000 (11:35 +0900)
committerInki Dae <inki.dae@samsung.com>
Tue, 14 Apr 2020 00:40:31 +0000 (09:40 +0900)
This patch moves OpenCV-dependent code from the inference-engine-vision layer
to the Inference layer. Now we can remove all inference-engine-vision related
files - inference_engine_vision_impl.cpp and inference_engine_vision_impl.h.
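
The Inference class now does the OpenCV preprocessing itself and exchanges plain
tensor buffers with the backend. Roughly, the new flow looks like this (a
simplified sketch; the path variables and mvSource are placeholders, tensor/engine
configuration calls are omitted and error handling is skipped):

    mediavision::inference::Inference inference;
    inference.ConfigureModelFiles(configPath, weightPath, labelPath);
    inference.Prepare();            // caches input size, mean/deviation and threshold
    inference.Load();               // loads the model and wraps the backend input memory in a cv::Mat
    inference.Run(mvSource, NULL);  // DoPreprocess() fills the input buffer, then the backend runs
    ImageClassificationResults results;
    inference.GetClassficationResults(&results);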

Change-Id: I7750a4b8b85aba8a2d220fb4ee88f7007d6d5939
Signed-off-by: Inki Dae <inki.dae@samsung.com>
mv_inference/inference/include/Inference.h
mv_inference/inference/src/Inference.cpp

index 223ad155c30190e80fe299bb4b01d7451696199a..4c6a4f6e84ccace99c56cdcd99485afaeeb482d0 100755 (executable)
@@ -24,6 +24,8 @@
 #include "inference_engine_error.h"
 #include "inference_engine_vision_impl.h"
 #include <mv_inference_type.h>
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
 
 /**
  * @file Inference.h
  *        provides inference interface.
  */
 using namespace InferenceEngineInterface::Vision;
+
+typedef struct _ImageClassificationResults {
+    int number_of_classes;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+} ImageClassificationResults; /**< structure ImageClassificationResults */
+
+typedef struct _ObjectDetectionResults {
+    int number_of_objects;
+    std::vector<int> indices;
+    std::vector<std::string> names;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} ObjectDetectionResults;  /**< structure ObjectDetectionResults */
+
+typedef struct _FaceDetectionResults {
+    int number_of_faces;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> locations;
+} FaceDetectionResults;  /**< structure FaceDetectionResults */
+
+typedef struct _FacialLandMarkDetectionResults {
+    int number_of_landmarks;
+    std::vector<cv::Point> locations;
+} FacialLandMarkDetectionResults;  /**< structure FacialLandMarkDetectionResults */
+
 namespace mediavision {
 namespace inference {
 
@@ -262,20 +291,33 @@ public:
 
 private:
        bool mCanRun; /**< The flag indicating ready to run Inference */
+    InferenceConfig mConfig;
+       inference_engine_capacity mBackendCapacity;
+       std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
+       cv::Size mInputSize;
+       int mCh;
+       int mDim;
+       double mDeviation;
+       double mMean;
+       double mThreshold;
+       int mOutputNumbers;
+       cv::Size mSourceSize;
+       cv::Mat mInputBuffer;
+       int mMatType;
 
-       InferenceConfig mConfig;
        mv_engine_config_h engine_config;
-       inference_engine_capacity mBackendCapacity;
 
        InferenceEngineVision * mBackend;
 
-       std::map<int, std::pair<std::string, bool>> mSupportedInferenceBackend;
        std::map<std::string, int> mModelFormats;
+    std::vector<std::string> mUserListName;
 
 private:
        void CheckSupportedInferenceBackend();
        int ConvertEngineErrorToVisionError(int error);
        int ConvertTargetTypes(int given_types);
+       int DoPreprocess(cv::Mat cvImg);
+       int SetUserFile(std::string filename);
 };
 
 } /* Inference */
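
With these result structures now defined in Inference.h, a caller consumes them
directly from the Inference layer. A minimal sketch (setup and error handling
omitted; "inference" is a prepared Inference instance):

    ObjectDetectionResults results;
    if (inference.GetObjectDetectionResults(&results) == MEDIA_VISION_ERROR_NONE) {
        for (int i = 0; i < results.number_of_objects; ++i) {
            const cv::Rect &box = results.locations[i];
            LOGI("%s (%.2f) at x:%d y:%d w:%d h:%d",
                 results.names[i].c_str(), results.confidences[i],
                 box.x, box.y, box.width, box.height);
        }
    }
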
index ee72a6d66a42ffff97408534f243648c082519a3..4ef3f66fc0956416c5d8a9bcaa67dc39c090964a 100755 (executable)
@@ -23,6 +23,7 @@
 #include <unistd.h>
 #include <fstream>
 #include <string>
+#include <queue>
 #include <algorithm>
 
 #define MV_INFERENCE_OUTPUT_NUMBERS_MAX 10
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MAX 1.0
 #define MV_INFERENCE_CONFIDENCE_THRESHOLD_MIN 0.0
 
+typedef enum {
+  InputAttrNoType = 0,
+  InputAttrFloat32 = 1,
+  InputAttrInt32 = 2,
+  InputAttrUInt8 = 3,
+  InputAttrInt64 = 4,
+  InputAttrString = 5,
+  InputAttrBool = 6,
+} InputAttrType;
+
 namespace mediavision {
 namespace inference {
 InferenceConfig::InferenceConfig() :
@@ -53,7 +64,17 @@ Inference::Inference() :
        mCanRun(),
        mConfig(),
        mBackendCapacity(),
-       mSupportedInferenceBackend()
+       mSupportedInferenceBackend(),
+    mInputSize(cv::Size()),
+    mCh(0),
+    mDim(0),
+    mDeviation(0.0),
+    mMean(0.0),
+    mThreshold(0.0),
+    mOutputNumbers(0),
+    mSourceSize(cv::Size()),
+    mInputBuffer(cv::Mat()),
+    mMatType(0)
 {
        LOGI("ENTER");
 
@@ -178,6 +199,68 @@ int Inference::ConvertTargetTypes(int given_types)
        return target_types;
 }
 
+int Inference::DoPreprocess(cv::Mat cvImg)
+{
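+    // Preprocessing summary: optionally convert BGR to grayscale when the model expects a
+    // single channel, resize to the model input size, subtract the mean and divide by the
+    // deviation, then cast the result into the backend input buffer (mInputBuffer).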
+    mSourceSize = cvImg.size();
+    int width = mInputSize.width;
+    int height = mInputSize.height;
+
+    cv::Mat sample;
+    if (cvImg.channels() == 3 && mCh == 1)
+        cv::cvtColor(cvImg, sample, cv::COLOR_BGR2GRAY);
+    else
+        sample = cvImg;
+
+    // size
+    cv::Mat sampleResized;
+    if (sample.size() != cv::Size(width, height))
+        cv::resize(sample, sampleResized, cv::Size(width, height));
+    else
+        sampleResized = sample;
+
+    // type
+    cv::Mat sampleFloat;
+    if (mCh == 3)
+        sampleResized.convertTo(sampleFloat, CV_32FC3);
+    else
+        sampleResized.convertTo(sampleFloat, CV_32FC1);
+
+    // normalize
+    cv::Mat sampleNormalized;
+    cv::Mat meanMat;
+    if (mCh == 3)
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC3, cv::Scalar((float)mMean, (float)mMean, (float)mMean));
+    else
+        meanMat = cv::Mat(sampleFloat.size(), CV_32FC1, cv::Scalar((float)mMean));
+
+    cv::subtract(sampleFloat, meanMat, sampleNormalized);
+
+    sampleNormalized /= (float)mDeviation;
+
+    sampleNormalized.convertTo(mInputBuffer, mMatType);
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
+int Inference::SetUserFile(std::string filename)
+{
+    std::ifstream fp(filename.c_str());
+    if (!fp.is_open()) {
+        return MEDIA_VISION_ERROR_INVALID_PATH;
+    }
+
+    std::string userListName;
+    while (std::getline(fp, userListName)) {
+        if (userListName.length())
+            mUserListName.push_back(userListName);
+    }
+
+    fp.close();
+
+    return MEDIA_VISION_ERROR_NONE;
+}
+
 void Inference::ConfigureModelFiles(const std::string modelConfigFilePath,
                    const std::string modelWeightFilePath,
                    const std::string modelUserFilePath)
@@ -194,7 +277,7 @@ void Inference::ConfigureTensorInfo(int width,
                        double stdValue,
                        double meanValue)
 {
-    mConfig.mTensorInfo = {width, height, dim, ch};
+       mConfig.mTensorInfo = {width, height, dim, ch};
        mConfig.mStdValue = stdValue;
        mConfig.mMeanValue = meanValue;
 }
@@ -298,20 +381,22 @@ int Inference::Prepare(void)
 {
        LOGI("ENTER");
 
-       // Input Tensor Param
-       mBackend->SetInputTensorParamInput(mConfig.mTensorInfo.width,
-                    mConfig.mTensorInfo.height,
-                    mConfig.mTensorInfo.dim,
-                    mConfig.mTensorInfo.ch);
+       mCh = mConfig.mTensorInfo.ch;
+       mDim = mConfig.mTensorInfo.dim;
+       mInputSize = cv::Size(mConfig.mTensorInfo.width, mConfig.mTensorInfo.height);
+       LOGI("InputSize is %d x %d\n", mInputSize.width, mInputSize.height);
 
-       mBackend->SetInputTensorParamNorm(mConfig.mStdValue, mConfig.mMeanValue);
+       mDeviation = mConfig.mStdValue;
+       mMean = mConfig.mMeanValue;
+       LOGI("mean %.4f, deviation %.4f", mMean, mDeviation);
 
        mBackend->SetInputTensorParamNode(mConfig.mInputNodeName);
 
-       // Output Tensor Param
-       mBackend->SetOutputTensorParamNumbers(mConfig.mMaxOutputNumbers);
+       mOutputNumbers = mConfig.mMaxOutputNumbers;
+       LOGI("outputNumber %d", mOutputNumbers);
 
-       mBackend->SetOutputTensorParamThresHold(mConfig.mConfidenceThresHold);
+       mThreshold = mConfig.mConfidenceThresHold;
+       LOGI("threshold %.4f", mThreshold);
 
        mBackend->SetOutputTensorParamNodes(mConfig.mOutputNodeNames);
 
@@ -348,6 +433,19 @@ int Inference::Load(void)
 {
        LOGI("ENTER");
 
+    std::string label_file = mConfig.mUserFilePath;
+    size_t userFileLength = label_file.length();
+    if (userFileLength > 0 && access(label_file.c_str(), F_OK)) {
+        LOGE("Label file [%s] is not accessible.", label_file.c_str());
+        return MEDIA_VISION_ERROR_INVALID_PARAMETER;
+    }
+
+    int ret = (userFileLength > 0) ? SetUserFile(label_file) : MEDIA_VISION_ERROR_NONE;
+    if (ret != MEDIA_VISION_ERROR_NONE) {
+        LOGE("Fail to load label file.");
+        return ret;
+    }
+
        // Check if model file is valid or not.
        std::string ext_str = mConfig.mWeightFilePath.substr(mConfig.mWeightFilePath.find_last_of(".") + 1);
        std::map<std::string, int>::iterator key = mModelFormats.find(ext_str);
@@ -378,19 +476,73 @@ int Inference::Load(void)
                break;
        }
 
-       models.push_back(mConfig.mUserFilePath);
-
     // Request model loading to backend engine.
-    int ret = mBackend->Load(models, (inference_model_format_e)key->second);
+    ret = mBackend->Load(models, (inference_model_format_e)key->second);
        if (ret != INFERENCE_ENGINE_ERROR_NONE) {
                delete mBackend;
                LOGE("Fail to load model");
                mCanRun = false;
-               goto out;
+               std::vector<std::string>().swap(models);
+               return ConvertEngineErrorToVisionError(ret);
        }
 
+       // Get the input tensor type and set up mInputBuffer accordingly.
+       InputAttrType attrType = static_cast<InputAttrType>(mBackend->GetInputLayerAttrType());
+       if (attrType == InputAttrUInt8) {
+               LOGI("InputType is %d ch with UINT8", mCh);
+               if (mCh == 1) {
+                       mMatType = CV_8UC1;
+               } else if (mCh == 3) {
+                       mMatType = CV_8UC3;
+               } else {
+                       LOGE("Not supported");
+                       std::vector<std::string>().swap(models);
+                       return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+               }
+       } else if (attrType == InputAttrFloat32) {
+               LOGI("InputType is %d ch with FLOAT32", mCh);
+               if (mCh == 1) {
+                       mMatType = CV_32FC1;
+               } else if (mCh == 3) {
+                       mMatType = CV_32FC3;
+               } else {
+                       LOGE("Not supported");
+                       std::vector<std::string>().swap(models);
+                       return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+               }
+       } else {
+               LOGE("Not supported");
+               std::vector<std::string>().swap(models);
+               return MEDIA_VISION_ERROR_NOT_SUPPORTED_FORMAT;
+       }
+
+       tensor_t inputData;
+       std::vector<int> info{1, mMatType, mInputSize.height, mInputSize.width};
+       inputData.dimInfo.push_back(info);
+
+       // Some plug-ins (e.g. OpenCV) don't allocate memory for the input tensor while
+       // loading a model, but others (e.g. TFLite) do. Thus SetInputDataBuffer() is
+       // implemented in plug-ins such as OpenCV and left as a no-op in plug-ins such as TFLite.
+       ret = mBackend->SetInputDataBuffer(inputData);
+       if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+               LOGE("Fail to SetInputDataBuffer");
+               std::vector<std::string>().swap(models);
+               return ConvertEngineErrorToVisionError(ret);
+       }
+
+       void *dataPtr = mBackend->GetInputDataPtr();
+       if (dataPtr == nullptr) {
+               LOGE("input data address is null");
+               std::vector<std::string>().swap(models);
+               return MEDIA_VISION_ERROR_INTERNAL;
+       }
+
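+       // mInputBuffer wraps the memory returned by the backend, so DoPreprocess() writes
+       // the preprocessed image straight into the input tensor without an extra copy.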
+       mInputBuffer = cv::Mat(mInputSize.height, mInputSize.width, mMatType, dataPtr);
+
        mCanRun = true;
-out:
+
        std::vector<std::string>().swap(models);
 
        LOGI("LEAVE");
@@ -438,7 +590,15 @@ int Inference::Run(mv_source_h mvSource, mv_rectangle_s *roi)
        }
 
        LOGE("Size: w:%d, h:%d", cvSource.size().width, cvSource.size().height);
-       ret = mBackend->Run(cvSource);
+
+       // Convert color space of input tensor data and then normalize it.
+       ret = DoPreprocess(cvSource);
+       if (ret != MEDIA_VISION_ERROR_NONE) {
+               LOGE("Fail to preprocess input tensor data.");
+               return ret;
+       }
+
+       ret = mBackend->Run();
 
        return ConvertEngineErrorToVisionError(ret);
 }
@@ -450,13 +610,63 @@ std::pair<std::string, bool> Inference::GetSupportedInferenceBackend(int backend
 
 int Inference::GetClassficationResults(ImageClassificationResults *classificationResults)
 {
-       ImageClassificationResults results;
-       int ret = mBackend->GetInferenceResult(results);
+       tensor_t outputData;
+       int ret = mBackend->GetInferenceResult(outputData);
        if (ret != INFERENCE_ENGINE_ERROR_NONE) {
                LOGE("Fail to GetClassificationResults");
                return ConvertEngineErrorToVisionError(ret);
        }
 
+       // Will contain top N results in ascending order.
+       std::vector<std::pair<float, int>> top_results;
+       std::priority_queue<std::pair<float, int>,
+               std::vector<std::pair<float, int>>,
+               std::greater<std::pair<float, int>>> top_result_pq;
+       float value;
+
+       std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
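+       // Assumption: the model exposes a single float output tensor of per-class scores,
+       // so inferDimInfo[0][1] is the number of classes and inferResults[0] the score array.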
+       long count = inferDimInfo[0][1];
+       LOGI("count: %ld", count);
+
+       float *prediction = reinterpret_cast<float*>(inferResults[0]);
+       for (int i = 0; i < count; ++i) {
+               value = prediction[i];
+               // Push every class score; entries below the threshold are filtered out
+               // later when the results are copied into the output structure.
+               top_result_pq.push(std::pair<float, int>(value, i));
+
+               // If at capacity, kick the smallest value out.
+               if (top_result_pq.size() > (size_t)mOutputNumbers) {
+                       top_result_pq.pop();
+               }
+       }
+
+       // Copy to output vector and reverse into descending order.
+       while (!top_result_pq.empty()) {
+               top_results.push_back(top_result_pq.top());
+               top_result_pq.pop();
+       }
+       std::reverse(top_results.begin(), top_results.end());
+
+       int classIdx = -1;
+       ImageClassificationResults results;
+       results.number_of_classes = 0;
+       for (int idx = 0; idx < top_results.size(); ++idx) {
+               if (top_results[idx].first < mThreshold)
+                       continue;
+               LOGI("idx:%d", idx);
+               LOGI("classIdx: %d", top_results[idx].second);
+               LOGI("classProb: %f", top_results[idx].first);
+
+               classIdx = top_results[idx].second;
+               results.indices.push_back(classIdx);
+               results.confidences.push_back(top_results[idx].first);
+               results.names.push_back(mUserListName[classIdx]);
+               results.number_of_classes++;
+       }
+
        *classificationResults = results;
        LOGE("Inference: GetClassificationResults: %d\n", results.number_of_classes);
        return MEDIA_VISION_ERROR_NONE;
@@ -464,13 +674,51 @@ int Inference::GetClassficationResults(ImageClassificationResults *classificatio
 
 int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResults)
 {
-       ObjectDetectionResults results;
-       int ret = mBackend->GetInferenceResult(results);
+       tensor_t outputData;
+       int ret = mBackend->GetInferenceResult(outputData);
        if (ret != INFERENCE_ENGINE_ERROR_NONE) {
                LOGE("Fail to GetObjectDetectionResults");
                return ConvertEngineErrorToVisionError(ret);
        }
 
+       std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
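+       // Assumed output layout (typical SSD-style post-processing): inferResults[0] holds
+       // normalized box coordinates ordered [top, left, bottom, right] per detection,
+       // inferResults[1] the class indices, inferResults[2] the scores and
+       // inferResults[3] the number of detections.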
+       float* boxes = reinterpret_cast<float*>(inferResults[0]);
+       float* classes = reinterpret_cast<float*>(inferResults[1]);
+       float* scores = reinterpret_cast<float*>(inferResults[2]);
+       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+
+       int left, top, right, bottom;
+       cv::Rect loc;
+
+       ObjectDetectionResults results;
+       results.number_of_objects = 0;
+       for (int idx = 0; idx < number_of_detections; ++idx) {
+               if (scores[idx] < mThreshold)
+                       continue;
+
+               left =   (int)(boxes[idx*4 + 1] * mSourceSize.width);
+               top  =   (int)(boxes[idx*4 + 0] * mSourceSize.height);
+               right  = (int)(boxes[idx*4 + 3] * mSourceSize.width);
+               bottom = (int)(boxes[idx*4 + 2] * mSourceSize.height);
+
+               loc.x = left;
+               loc.y = top;
+               loc.width = right - left + 1;
+               loc.height = bottom - top + 1;
+
+               results.indices.push_back((int)classes[idx]);
+               results.confidences.push_back(scores[idx]);
+               results.names.push_back(mUserListName[(int)classes[idx]]);
+               results.locations.push_back(loc);
+               results.number_of_objects++;
+
+               LOGI("objectClass: %d", (int)classes[idx]);
+               LOGI("confidence:%f", scores[idx]);
+               LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+       }
+
        *detectionResults = results;
        LOGE("Inference: GetObjectDetectionResults: %d\n", results.number_of_objects);
        return MEDIA_VISION_ERROR_NONE;
@@ -478,8 +726,49 @@ int Inference::GetObjectDetectionResults(ObjectDetectionResults *detectionResult
 
 int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 {
+       tensor_t outputData;
+       int ret = mBackend->GetInferenceResult(outputData);
+       if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+               LOGE("Fail to GetFaceDetectionResults");
+               return ConvertEngineErrorToVisionError(ret);
+       }
+
+       std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
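+       // Same assumed SSD-style output layout as in object detection: normalized
+       // [top, left, bottom, right] boxes, class indices, scores and the detection count.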
+       float* boxes = reinterpret_cast<float*>(inferResults[0]);
+       float* classes = reinterpret_cast<float*>(inferResults[1]);
+       float* scores = reinterpret_cast<float*>(inferResults[2]);
+
+       int number_of_detections = (int)(*reinterpret_cast<float*>(inferResults[3]));
+       int left, top, right, bottom;
+       cv::Rect loc;
+
        FaceDetectionResults results;
-       mBackend->GetInferenceResult(results);
+       results.number_of_faces = 0;
+       for (int idx = 0; idx < number_of_detections; ++idx) {
+               if (scores[idx] < mThreshold)
+                       continue;
+
+               left =   (int)(boxes[idx*4 + 1] * mSourceSize.width);
+               top  =   (int)(boxes[idx*4 + 0] * mSourceSize.height);
+               right  = (int)(boxes[idx*4 + 3] * mSourceSize.width);
+               bottom = (int)(boxes[idx*4 + 2] * mSourceSize.height);
+
+               loc.x = left;
+               loc.y = top;
+               loc.width = right - left + 1;
+               loc.height = bottom - top + 1;
+
+               results.confidences.push_back(scores[idx]);
+               results.locations.push_back(loc);
+               results.number_of_faces++;
+
+               LOGI("confidence:%f", scores[idx]);
+               LOGI("class: %f", classes[idx]);
+               LOGI("left:%f, top:%f, right:%f, bottom:%f", boxes[idx*4 + 1], boxes[idx*4 + 0], boxes[idx*4 + 3], boxes[idx*4 + 2]);
+               LOGI("left:%d, top:%d, right:%d, bottom:%d", left, top, right, bottom);
+       }
 
        *detectionResults = results;
        LOGE("Inference: GetFaceDetectionResults: %d\n", results.number_of_faces);
@@ -488,8 +777,34 @@ int Inference::GetFaceDetectionResults(FaceDetectionResults *detectionResults)
 
 int Inference::GetFacialLandMarkDetectionResults(FacialLandMarkDetectionResults *detectionResults)
 {
+       tensor_t outputData;
+       int ret = mBackend->GetInferenceResult(outputData);
+       if (ret != INFERENCE_ENGINE_ERROR_NONE) {
+               LOGE("Fail to GetFacialLandMarkDetectionResults");
+               return ConvertEngineErrorToVisionError(ret);
+       }
+
+       std::vector<std::vector<int>> inferDimInfo(outputData.dimInfo);
+       std::vector<void*> inferResults(outputData.data.begin(), outputData.data.end());
+
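+       // Assumption: the landmark output is a single float tensor of normalized coordinates
+       // interleaved as x, y pairs, so inferDimInfo[0][1] is twice the number of landmarks.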
+       long number_of_detections = inferDimInfo[0][1];
+       float* loc = reinterpret_cast<float*>(inferResults[0]);
+
        FacialLandMarkDetectionResults results;
-       mBackend->GetInferenceResult(results);
+       results.number_of_landmarks = 0;
+
+       cv::Point point(0, 0);
+       LOGI("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height);
+       for (int idx = 0; idx < number_of_detections; idx += 2) {
+               point.x = (int)(loc[idx] * mSourceSize.width);
+               point.y = (int)(loc[idx+1] * mSourceSize.height);
+
+               results.locations.push_back(point);
+               results.number_of_landmarks++;
+
+               LOGI("x:%d, y:%d", point.x, point.y);
+       }
 
        *detectionResults = results;
        LOGE("Inference: FacialLandmarkDetectionResults: %d\n", results.number_of_landmarks);