samples/dnn/faster_rcnn.cpp

// Faster-RCNN models use a custom layer called 'Proposal' that is written in Python.
// To map it onto OpenCV's own layer, replace the layer node that has
// [type: 'Python'] with the following definition:
   4 // layer {
   5 //   name: 'proposal'
   6 //   type: 'Proposal'
   7 //   bottom: 'rpn_cls_prob_reshape'
   8 //   bottom: 'rpn_bbox_pred'
   9 //   bottom: 'im_info'
  10 //   top: 'rois'
  11 //   proposal_param {
  12 //     ratio: 0.5
  13 //     ratio: 1.0
  14 //     ratio: 2.0
  15 //     scale: 8
  16 //     scale: 16
  17 //     scale: 32
  18 //   }
  19 // }
#include <algorithm>
#include <iostream>

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
  26
  27 using namespace cv;
  28 using namespace dnn;
  29
  30 const char* about = "This sample is used to run Faster-RCNN object detection "
  31                     "models from https://github.com/rbgirshick/py-faster-rcnn with OpenCV.";
  32
  33 const char* keys =
  34     "{ help  h |     | print help message  }"
  35     "{ proto p |     | path to .prototxt   }"
  36     "{ model m |     | path to .caffemodel }"
  37     "{ image i |     | path to input image }"
  38     "{ conf  c | 0.8 | minimal confidence  }";
  39
  40 const char* classNames[] = {
  41     "__background__",
  42     "aeroplane", "bicycle", "bird", "boat",
  43     "bottle", "bus", "car", "cat", "chair",
  44     "cow", "diningtable", "dog", "horse",
  45     "motorbike", "person", "pottedplant",
  46     "sheep", "sofa", "train", "tvmonitor"
  47 };
  48
  49 static const int kInpWidth = 800;
  50 static const int kInpHeight = 600;
  51
  52 int main(int argc, char** argv)
  53 {
  54     // Parse command line arguments.
  55     CommandLineParser parser(argc, argv, keys);
  56     if (argc == 1 || parser.has("help"))
  57     {
  58         std::cout << about << std::endl;
  59         return 0;
  60     }
  61
  62     String protoPath = parser.get<String>("proto");
  63     String modelPath = parser.get<String>("model");
  64     String imagePath = parser.get<String>("image");
  65     float confThreshold = parser.get<float>("conf");
  66     CV_Assert(!protoPath.empty(), !modelPath.empty(), !imagePath.empty());
  67
  68     // Load a model.
  69     Net net = readNetFromCaffe(protoPath, modelPath);
  70
  71     // Create a preprocessing layer that does final bounding boxes applying predicted
  72     // deltas to objects locations proposals and doing non-maximum suppression over it.
  73     LayerParams lp;
  74     lp.set("code_type", "CENTER_SIZE");               // An every bounding box is [xmin, ymin, xmax, ymax]
  75     lp.set("num_classes", 21);
  76     lp.set("share_location", (int)false);             // Separate predictions for different classes.
  77     lp.set("background_label_id", 0);
  78     lp.set("variance_encoded_in_target", (int)true);
  79     lp.set("keep_top_k", 100);
  80     lp.set("nms_threshold", 0.3);
  81     lp.set("normalized_bbox", (int)false);
  82     Ptr<Layer> detectionOutputLayer = DetectionOutputLayer::create(lp);
  83
  84     Mat img = imread(imagePath);
  85     resize(img, img, Size(kInpWidth, kInpHeight));
  86     Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
  87     Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
  88
  89     net.setInput(blob, "data");
  90     net.setInput(imInfo, "im_info");
  91
  92     std::vector<Mat> outs;
  93     std::vector<String> outNames(3);
  94     outNames[0] = "proposal";
  95     outNames[1] = "bbox_pred";
  96     outNames[2] = "cls_prob";
  97     net.forward(outs, outNames);
  98
  99     Mat proposals = outs[0].colRange(1, 5).clone();  // Only last 4 columns.
 100     Mat& deltas = outs[1];
 101     Mat& scores = outs[2];
 102
 103     // Reshape proposals from Nx4 to 1x1xN*4
 104     std::vector<int> shape(3, 1);
 105     shape[2] = (int)proposals.total();
 106     proposals = proposals.reshape(1, shape);
 107
 108     // Run postprocessing layer.
 109     std::vector<Mat> layerInputs(3), layerOutputs(1), layerInternals;
 110     layerInputs[0] = deltas.reshape(1, 1);
 111     layerInputs[1] = scores.reshape(1, 1);
 112     layerInputs[2] = proposals;
 113     detectionOutputLayer->forward(layerInputs, layerOutputs, layerInternals);
 114
 115     // Draw detections.
 116     Mat detections = layerOutputs[0];
 117     const float* data = (float*)detections.data;
 118     for (size_t i = 0; i < detections.total(); i += 7)
 119     {
 120         // An every detection is a vector [id, classId, confidence, left, top, right, bottom]
 121         float confidence = data[i + 2];
 122         if (confidence > confThreshold)
 123         {
 124             int classId = (int)data[i + 1];
 125             int left = max(0, min((int)data[i + 3], img.cols - 1));
 126             int top = max(0, min((int)data[i + 4], img.rows - 1));
 127             int right = max(0, min((int)data[i + 5], img.cols - 1));
 128             int bottom = max(0, min((int)data[i + 6], img.rows - 1));
 129
 130             // Draw a bounding box.
 131             rectangle(img, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
 132
 133             // Put a label with a class name and confidence.
 134             String label = cv::format("%s, %.3f", classNames[classId], confidence);
 135             int baseLine;
 136             Size labelSize = cv::getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
 137
 138             top = max(top, labelSize.height);
 139             rectangle(img, Point(left, top - labelSize.height),
 140                       Point(left + labelSize.width, top + baseLine),
 141                       Scalar(255, 255, 255), FILLED);
 142             putText(img, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
 143         }
 144     }
 145     imshow("frame", img);
 146     waitKey();
 147     return 0;
 148 }