From 4992abecec7214bce3c07497438c2e1ff963e657 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@imaginarynumber.net>
Date: Sun, 8 Jun 2014 20:31:35 -0700
Subject: [PATCH] pycaffe Detector crops with surrounding context

- caffe.Detector learned how to crop windows with context in the R-CNN
  style s.t. the border of the network input is a given amount of
  context.
- add --context_pad arg to detect.py for amount of context. Default is
  16, as in R-CNN.
---
 python/caffe/detector.py | 91 ++++++++++++++++++++++++++++++++++++++++++++----
 python/detect.py         |  9 ++++-
 2 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/python/caffe/detector.py b/python/caffe/detector.py
index 5a30dab..b4e9602 100644
--- a/python/caffe/detector.py
+++ b/python/caffe/detector.py
@@ -12,10 +12,6 @@ This implementation follows ideas in
 The selective_search_ijcv_with_python code required for the selective search
 proposal mode is available at
     https://github.com/sergeyk/selective_search_ijcv_with_python
-
-TODO
-- R-CNN crop mode / crop with context.
-- Bundle with R-CNN model for example.
 """
 import numpy as np
 import os
@@ -29,11 +25,14 @@ class Detector(caffe.Net):
     selective search proposals.
     """
     def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
-                 input_scale=None, channel_swap=None):
+                 input_scale=None, channel_swap=None, context_pad=None):
         """
         Take
         gpu, mean_file, input_scale, channel_swap: convenience params for
             setting mode, mean, input scale, and channel order.
+        context_pad: amount of surrounding context to take s.t. a `context_pad`
+            sized border of pixels in the network input image is context, as in
+            R-CNN feature extraction.
         """
         caffe.Net.__init__(self, model_file, pretrained_file)
         self.set_phase_test()
@@ -50,6 +49,8 @@ class Detector(caffe.Net):
         if channel_swap:
             self.set_channel_swap(self.inputs[0], channel_swap)
 
+        self.configure_crop(context_pad)
+
 
     def detect_windows(self, images_windows):
         """
@@ -58,6 +59,7 @@ class Detector(caffe.Net):
 
         Take
         images_windows: (image filename, window list) iterable.
+        Crops include context according to the `context_pad` configuration.
 
         Give
         detections: list of {filename: image filename, window: crop coordinates,
@@ -68,8 +70,7 @@ class Detector(caffe.Net):
         for image_fname, windows in images_windows:
             image = caffe.io.load_image(image_fname).astype(np.float32)
             for window in windows:
-                window_inputs.append(image[window[0]:window[2],
-                                           window[1]:window[3]])
+                window_inputs.append(self.crop(image, window))
 
         # Run through the net (warping windows to input dimensions).
         caffe_in = np.asarray([self.preprocess(self.inputs[0], window_in)
@@ -109,3 +110,79 @@ class Detector(caffe.Net):
         windows_list = selective_search.get_windows(image_fnames)
         # Run windowed detection on the selective search list.
         return self.detect_windows(zip(image_fnames, windows_list))
+
+
+    def crop(self, im, window):
+        """
+        Crop a window from the image for detection. Include surrounding context
+        according to the `context_pad` configuration.
+
+        Take
+        im: H x W x K image ndarray to crop.
+        window: bounding box coordinates as ymin, xmin, ymax, xmax.
+
+        Give
+        crop: cropped window.
+        """
+        # Crop window from the image.
+        crop = im[window[0]:window[2], window[1]:window[3]]
+
+        if self.context_pad:
+            box = window.copy()
+            crop_size = self.blobs[self.inputs[0]].width  # assumes square
+            scale = crop_size / (1. * crop_size - self.context_pad * 2)
+            # Crop a box + surrounding context.
+            half_h = (box[2] - box[0] + 1) / 2.
+            half_w = (box[3] - box[1] + 1) / 2.
+            center = (box[0] + half_h, box[1] + half_w)
+            scaled_dims = scale * np.array((-half_h, -half_w, half_h, half_w))
+            box = np.round(np.tile(center, 2) + scaled_dims)
+            full_h = box[2] - box[0] + 1
+            full_w = box[3] - box[1] + 1
+            scale_h = crop_size / full_h
+            scale_w = crop_size / full_w
+            pad_y = round(max(0, -box[0]) * scale_h)  # amount out-of-bounds
+            pad_x = round(max(0, -box[1]) * scale_w)
+
+            # Clip box to image dimensions.
+            im_h, im_w = im.shape[:2]
+            box = np.clip(box, 0., [im_h, im_w, im_h, im_w])
+            clip_h = box[2] - box[0] + 1
+            clip_w = box[3] - box[1] + 1
+            assert(clip_h > 0 and clip_w > 0)
+            crop_h = round(clip_h * scale_h)
+            crop_w = round(clip_w * scale_w)
+            if pad_y + crop_h > crop_size:
+                crop_h = crop_size - pad_y
+            if pad_x + crop_w > crop_size:
+                crop_w = crop_size - pad_x
+
+            # collect with context padding and place in input
+            # with mean padding
+            context_crop = im[box[0]:box[2], box[1]:box[3]]
+            context_crop = caffe.io.resize_image(context_crop, (crop_h, crop_w))
+            crop = self.crop_mean.copy()
+            crop[pad_y:(pad_y + crop_h), pad_x:(pad_x + crop_w)] = context_crop
+
+        return crop
+
+
+    def configure_crop(self, context_pad):
+        """
+        Configure amount of context for cropping.
+        If context is included, make the special input mean for context padding.
+
+        Take
+        context_pad: amount of context for cropping.
+        """
+        self.context_pad = context_pad
+        if self.context_pad:
+            input_scale = self.input_scale.get(self.inputs[0])
+            channel_order = self.channel_swap.get(self.inputs[0])
+            # Padding context crops needs the mean in unprocessed input space.
+            self.crop_mean = self.mean[self.inputs[0]].copy()
+            self.crop_mean = self.crop_mean.transpose((1,2,0))
+            channel_order_inverse = [channel_order.index(i)
+                                     for i in range(self.crop_mean.shape[2])]
+            self.crop_mean = self.crop_mean[:,:, channel_order_inverse]
+            self.crop_mean /= input_scale
diff --git a/python/detect.py b/python/detect.py
index 05b5244..a3bee5c 100755
--- a/python/detect.py
+++ b/python/detect.py
@@ -86,6 +86,12 @@ def main(argv):
              "RGB -> BGR since BGR is the Caffe default by way of OpenCV."
 
     )
+    parser.add_argument(
+        "--context_pad",
+        type=int,
+        default='16',
+        help="Amount of surrounding context to collect in input window."
+    )
     args = parser.parse_args()
 
     channel_swap = [int(s) for s in args.channel_swap.split(',')]
@@ -93,7 +99,8 @@ def main(argv):
     # Make detector.
     detector = caffe.Detector(args.model_def, args.pretrained_model,
             gpu=args.gpu, mean_file=args.mean_file,
-            input_scale=args.input_scale, channel_swap=channel_swap)
+            input_scale=args.input_scale, channel_swap=channel_swap,
+            context_pad=args.context_pad)
 
     if args.gpu:
         print 'GPU mode'
-- 
2.7.4