modules/dnn/src/model.cpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 #include "precomp.hpp"
   6 #include <algorithm>
   7 #include <iostream>
   8 #include <utility>
   9 #include <iterator>
  10
  11 #include <opencv2/imgproc.hpp>
  12
  13 namespace cv {
  14 namespace dnn {
  15
  16 struct Model::Impl
  17 {
  18     Size   size;
  19     Scalar mean;
  20     double  scale = 1.0;
  21     bool   swapRB = false;
  22     bool   crop = false;
  23     Mat    blob;
  24     std::vector<String> outNames;
  25
  26     void predict(Net& net, const Mat& frame, OutputArrayOfArrays outs)
  27     {
  28         if (size.empty())
  29             CV_Error(Error::StsBadSize, "Input size not specified");
  30
  31         blob = blobFromImage(frame, scale, size, mean, swapRB, crop);
  32         net.setInput(blob);
  33
  34         // Faster-RCNN or R-FCN
  35         if (net.getLayer(0)->outputNameToIndex("im_info") != -1)
  36         {
  37             Mat imInfo = (Mat_<float>(1, 3) << size.height, size.width, 1.6f);
  38             net.setInput(imInfo, "im_info");
  39         }
  40         net.forward(outs, outNames);
  41     }
  42 };
  43
  44 Model::Model() : impl(new Impl) {}
  45
  46 Model::Model(const String& model, const String& config)
  47     : Net(readNet(model, config)), impl(new Impl)
  48 {
  49     impl->outNames = getUnconnectedOutLayersNames();
  50     std::vector<MatShape> inLayerShapes;
  51     std::vector<MatShape> outLayerShapes;
  52     getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes);
  53     if (!inLayerShapes.empty() && inLayerShapes[0].size() == 4)
  54         impl->size = Size(inLayerShapes[0][3], inLayerShapes[0][2]);
  55 };
  56
  57 Model::Model(const Net& network) : Net(network), impl(new Impl)
  58 {
  59     impl->outNames = getUnconnectedOutLayersNames();
  60     std::vector<MatShape> inLayerShapes;
  61     std::vector<MatShape> outLayerShapes;
  62     getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes);
  63     if (!inLayerShapes.empty() && inLayerShapes[0].size() == 4)
  64         impl->size = Size(inLayerShapes[0][3], inLayerShapes[0][2]);
  65 };
  66
  67 Model& Model::setInputSize(const Size& size)
  68 {
  69     impl->size = size;
  70     return *this;
  71 }
  72
  73 Model& Model::setInputSize(int width, int height)
  74 {
  75     impl->size = Size(width, height);
  76     return *this;
  77 }
  78
  79 Model& Model::setInputMean(const Scalar& mean)
  80 {
  81     impl->mean = mean;
  82     return *this;
  83 }
  84
  85 Model& Model::setInputScale(double scale)
  86 {
  87     impl->scale = scale;
  88     return *this;
  89 }
  90
  91 Model& Model::setInputCrop(bool crop)
  92 {
  93     impl->crop = crop;
  94     return *this;
  95 }
  96
  97 Model& Model::setInputSwapRB(bool swapRB)
  98 {
  99     impl->swapRB = swapRB;
 100     return *this;
 101 }
 102
 103 void Model::setInputParams(double scale, const Size& size, const Scalar& mean,
 104                            bool swapRB, bool crop)
 105 {
 106     impl->size = size;
 107     impl->mean = mean;
 108     impl->scale = scale;
 109     impl->crop = crop;
 110     impl->swapRB = swapRB;
 111 }
 112
 113 void Model::predict(InputArray frame, OutputArrayOfArrays outs)
 114 {
 115     impl->predict(*this, frame.getMat(), outs);
 116 }
 117
 118 ClassificationModel::ClassificationModel(const String& model, const String& config)
 119     : Model(model, config) {};
 120
 121 ClassificationModel::ClassificationModel(const Net& network) : Model(network) {};
 122
 123 std::pair<int, float> ClassificationModel::classify(InputArray frame)
 124 {
 125     std::vector<Mat> outs;
 126     impl->predict(*this, frame.getMat(), outs);
 127     CV_Assert(outs.size() == 1);
 128
 129     double conf;
 130     cv::Point maxLoc;
 131     minMaxLoc(outs[0].reshape(1, 1), nullptr, &conf, nullptr, &maxLoc);
 132     return {maxLoc.x, static_cast<float>(conf)};
 133 }
 134
 135 void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
 136 {
 137     std::tie(classId, conf) = classify(frame);
 138 }
 139
 140 KeypointsModel::KeypointsModel(const String& model, const String& config)
 141     : Model(model, config) {};
 142
 143 KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};
 144
 145 std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
 146 {
 147
 148     int frameHeight = frame.getMat().size[0];
 149     int frameWidth = frame.getMat().size[1];
 150     std::vector<Mat> outs;
 151
 152     impl->predict(*this, frame.getMat(), outs);
 153     CV_Assert(outs.size() == 1);
 154     Mat output = outs[0];
 155
 156     const int nPoints = output.size[1];
 157     std::vector<Point2f> points;
 158
 159     // If output is a map, extract the keypoints
 160     if (output.dims == 4)
 161     {
 162         int height = output.size[2];
 163         int width = output.size[3];
 164
 165         // find the position of the keypoints (ignore the background)
 166         for (int n=0; n < nPoints - 1; n++)
 167         {
 168             // Probability map of corresponding keypoint
 169             Mat probMap(height, width, CV_32F, output.ptr(0, n));
 170
 171             Point2f p(-1, -1);
 172             Point maxLoc;
 173             double prob;
 174             minMaxLoc(probMap, NULL, &prob, NULL, &maxLoc);
 175             if (prob > thresh)
 176             {
 177                 p = maxLoc;
 178                 p.x *= (float)frameWidth / width;
 179                 p.y *= (float)frameHeight / height;
 180                 points.push_back(p);
 181             }
 182         }
 183     }
 184     // Otherwise the output is a vector of keypoints and we can just return it
 185     else
 186     {
 187         for (int n=0; n < nPoints; n++)
 188         {
 189             Point2f p;
 190             p.x = *output.ptr<float>(0, n, 0);
 191             p.y = *output.ptr<float>(0, n, 1);
 192             points.push_back(p);
 193         }
 194     }
 195     return points;
 196 }
 197
 198 SegmentationModel::SegmentationModel(const String& model, const String& config)
 199     : Model(model, config) {};
 200
 201 SegmentationModel::SegmentationModel(const Net& network) : Model(network) {};
 202
 203 void SegmentationModel::segment(InputArray frame, OutputArray mask)
 204 {
 205
 206     std::vector<Mat> outs;
 207     impl->predict(*this, frame.getMat(), outs);
 208     CV_Assert(outs.size() == 1);
 209     Mat score = outs[0];
 210
 211     const int chns = score.size[1];
 212     const int rows = score.size[2];
 213     const int cols = score.size[3];
 214
 215     mask.create(rows, cols, CV_8U);
 216     Mat classIds = mask.getMat();
 217     classIds.setTo(0);
 218     Mat maxVal(rows, cols, CV_32F, score.data);
 219
 220     for (int ch = 1; ch < chns; ch++)
 221     {
 222         for (int row = 0; row < rows; row++)
 223         {
 224             const float *ptrScore = score.ptr<float>(0, ch, row);
 225             uint8_t *ptrMaxCl = classIds.ptr<uint8_t>(row);
 226             float *ptrMaxVal = maxVal.ptr<float>(row);
 227             for (int col = 0; col < cols; col++)
 228             {
 229                 if (ptrScore[col] > ptrMaxVal[col])
 230                 {
 231                     ptrMaxVal[col] = ptrScore[col];
 232                     ptrMaxCl[col] = ch;
 233                 }
 234             }
 235         }
 236     }
 237 }
 238
 239 DetectionModel::DetectionModel(const String& model, const String& config)
 240     : Model(model, config) {};
 241
 242 DetectionModel::DetectionModel(const Net& network) : Model(network) {};
 243
 244 void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
 245                             CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
 246                             float confThreshold, float nmsThreshold)
 247 {
 248     std::vector<Mat> detections;
 249     impl->predict(*this, frame.getMat(), detections);
 250
 251     boxes.clear();
 252     confidences.clear();
 253     classIds.clear();
 254
 255     int frameWidth  = frame.cols();
 256     int frameHeight = frame.rows();
 257     if (getLayer(0)->outputNameToIndex("im_info") != -1)
 258     {
 259         frameWidth = impl->size.width;
 260         frameHeight = impl->size.height;
 261     }
 262
 263     std::vector<String> layerNames = getLayerNames();
 264     int lastLayerId = getLayerId(layerNames.back());
 265     Ptr<Layer> lastLayer = getLayer(lastLayerId);
 266
 267     std::vector<int> predClassIds;
 268     std::vector<Rect> predBoxes;
 269     std::vector<float> predConf;
 270     if (lastLayer->type == "DetectionOutput")
 271     {
 272         // Network produces output blob with a shape 1x1xNx7 where N is a number of
 273         // detections and an every detection is a vector of values
 274         // [batchId, classId, confidence, left, top, right, bottom]
 275         for (int i = 0; i < detections.size(); ++i)
 276         {
 277             float* data = (float*)detections[i].data;
 278             for (int j = 0; j < detections[i].total(); j += 7)
 279             {
 280                 float conf = data[j + 2];
 281                 if (conf < confThreshold)
 282                     continue;
 283
 284                 int left   = data[j + 3];
 285                 int top    = data[j + 4];
 286                 int right  = data[j + 5];
 287                 int bottom = data[j + 6];
 288                 int width  = right  - left + 1;
 289                 int height = bottom - top + 1;
 290
 291                 if (width <= 2 || height <= 2)
 292                 {
 293                     left   = data[j + 3] * frameWidth;
 294                     top    = data[j + 4] * frameHeight;
 295                     right  = data[j + 5] * frameWidth;
 296                     bottom = data[j + 6] * frameHeight;
 297                     width  = right  - left + 1;
 298                     height = bottom - top + 1;
 299                 }
 300
 301                 left   = std::max(0, std::min(left, frameWidth - 1));
 302                 top    = std::max(0, std::min(top, frameHeight - 1));
 303                 width  = std::max(1, std::min(width, frameWidth - left));
 304                 height = std::max(1, std::min(height, frameHeight - top));
 305                 predBoxes.emplace_back(left, top, width, height);
 306
 307                 predClassIds.push_back(static_cast<int>(data[j + 1]));
 308                 predConf.push_back(conf);
 309             }
 310         }
 311     }
 312     else if (lastLayer->type == "Region")
 313     {
 314         for (int i = 0; i < detections.size(); ++i)
 315         {
 316             // Network produces output blob with a shape NxC where N is a number of
 317             // detected objects and C is a number of classes + 4 where the first 4
 318             // numbers are [center_x, center_y, width, height]
 319             float* data = (float*)detections[i].data;
 320             for (int j = 0; j < detections[i].rows; ++j, data += detections[i].cols)
 321             {
 322
 323                 Mat scores = detections[i].row(j).colRange(5, detections[i].cols);
 324                 Point classIdPoint;
 325                 double conf;
 326                 minMaxLoc(scores, nullptr, &conf, nullptr, &classIdPoint);
 327
 328                 if (static_cast<float>(conf) < confThreshold)
 329                     continue;
 330
 331                 int centerX = data[0] * frameWidth;
 332                 int centerY = data[1] * frameHeight;
 333                 int width   = data[2] * frameWidth;
 334                 int height  = data[3] * frameHeight;
 335
 336                 int left = std::max(0, std::min(centerX - width / 2, frameWidth - 1));
 337                 int top  = std::max(0, std::min(centerY - height / 2, frameHeight - 1));
 338                 width    = std::max(1, std::min(width, frameWidth - left));
 339                 height   = std::max(1, std::min(height, frameHeight - top));
 340
 341                 predClassIds.push_back(classIdPoint.x);
 342                 predConf.push_back(static_cast<float>(conf));
 343                 predBoxes.emplace_back(left, top, width, height);
 344             }
 345         }
 346     }
 347     else
 348         CV_Error(Error::StsNotImplemented, "Unknown output layer type: \"" + lastLayer->type + "\"");
 349
 350     if (nmsThreshold)
 351     {
 352         std::vector<int> indices;
 353         NMSBoxes(predBoxes, predConf, confThreshold, nmsThreshold, indices);
 354
 355         boxes.reserve(indices.size());
 356         confidences.reserve(indices.size());
 357         classIds.reserve(indices.size());
 358
 359         for (int idx : indices)
 360         {
 361             boxes.push_back(predBoxes[idx]);
 362             confidences.push_back(predConf[idx]);
 363             classIds.push_back(predClassIds[idx]);
 364         }
 365     }
 366     else
 367     {
 368         boxes       = std::move(predBoxes);
 369         classIds    = std::move(predClassIds);
 370         confidences = std::move(predConf);
 371     }
 372
 373
 374
 375 }
 376
 377 }} // namespace