From 17a0c1686b1de328765b94e44d2636c6cb15caf4 Mon Sep 17 00:00:00 2001
From: Ross Girshick <rbg@eecs.berkeley.edu>
Date: Thu, 9 Jan 2014 13:48:50 -0800
Subject: [PATCH] support for adding padding to windows in the
 window_data_layer

---
 models/pascal_finetune.prototxt        |   1 +
 models/pascal_finetune_solver.prototxt |   2 +-
 models/pascal_finetune_val.prototxt    |   1 +
 src/caffe/layers/window_data_layer.cpp | 144 ++++++++++++++++++++++++++-------
 src/caffe/proto/caffe.proto            |   4 +
 5 files changed, 124 insertions(+), 28 deletions(-)
diff --git a/models/pascal_finetune.prototxt b/models/pascal_finetune.prototxt
index 5ee9e8a..229898e 100644
--- a/models/pascal_finetune.prototxt
+++ b/models/pascal_finetune.prototxt
@@ -7,6 +7,7 @@ layers {
     meanfile: "/home/rbg/working/caffe-rbg/data/ilsvrc2012_mean.binaryproto"
     batchsize: 128
     cropsize: 227
+    context_pad: 16
     mirror: true
     det_fg_threshold: 0.5
     det_bg_threshold: 0.5
diff --git a/models/pascal_finetune_solver.prototxt b/models/pascal_finetune_solver.prototxt
index 7ea9c5d..f53a1d5 100644
--- a/models/pascal_finetune_solver.prototxt
+++ b/models/pascal_finetune_solver.prototxt
@@ -11,4 +11,4 @@ max_iter: 100000
 momentum: 0.9
 weight_decay: 0.0005
 snapshot: 10000
-snapshot_prefix: "./snapshots/pascal_finetune_train"
+snapshot_prefix: "./snapshots/pascal_context16_finetune_train"
diff --git a/models/pascal_finetune_val.prototxt b/models/pascal_finetune_val.prototxt
index c73cf8f..53aab09 100644
--- a/models/pascal_finetune_val.prototxt
+++ b/models/pascal_finetune_val.prototxt
@@ -7,6 +7,7 @@ layers {
     meanfile: "/home/rbg/working/caffe-rbg/data/ilsvrc2012_mean.binaryproto"
     batchsize: 128
     cropsize: 227
+    context_pad: 16
     mirror: true
     det_fg_threshold: 0.5
     det_bg_threshold: 0.5
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index a2346bf..e8f52e7 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -39,6 +39,7 @@ void* WindowDataLayerPrefetch(void* layer_pointer) {
   const Dtype scale = layer->layer_param_.scale();
   const int batchsize = layer->layer_param_.batchsize();
   const int cropsize = layer->layer_param_.cropsize();
+  const int context_pad = layer->layer_param_.context_pad();
   const bool mirror = layer->layer_param_.mirror();
   const float fg_fraction = layer->layer_param_.det_fg_fraction();
   const Dtype* mean = layer->data_mean_.cpu_data();
@@ -47,6 +48,9 @@ void* WindowDataLayerPrefetch(void* layer_pointer) {
   const int mean_height = layer->data_mean_.height();
   cv::Size cv_crop_size(cropsize, cropsize);
 
+  // zero out batch so padding regions not covered by a window stay 0
+  memset(top_data, 0, sizeof(Dtype)*layer->prefetch_data_->count());
+
 //  CHECK_EQ(mean_width, mean_height);
 //  CHECK_EQ(mean_width, 256);
 //  CHECK_EQ(mean_off, 14);
@@ -64,6 +68,11 @@ void* WindowDataLayerPrefetch(void* layer_pointer) {
           ? layer->fg_windows_[rand() % layer->fg_windows_.size()]
           : layer->bg_windows_[rand() % layer->bg_windows_.size()];
 
+      bool do_mirror = false;
+      if (mirror && rand() % 2) {
+        do_mirror = true;
+      }
+
       // load the image containing the window
       std::pair<std::string, vector<int> > image = 
           layer->image_database_[window[WindowDataLayer<Dtype>::IMAGE_INDEX]];
@@ -77,35 +86,117 @@ void* WindowDataLayerPrefetch(void* layer_pointer) {
 //      CHECK_EQ(channels, 3);
 
       // crop window out of image and warp it
-      const int x1 = window[WindowDataLayer<Dtype>::X1];
-      const int y1 = window[WindowDataLayer<Dtype>::Y1];
-      const int x2 = window[WindowDataLayer<Dtype>::X2];
-      const int y2 = window[WindowDataLayer<Dtype>::Y2];
+      int x1 = window[WindowDataLayer<Dtype>::X1];
+      int y1 = window[WindowDataLayer<Dtype>::Y1];
+      int x2 = window[WindowDataLayer<Dtype>::X2];
+      int y2 = window[WindowDataLayer<Dtype>::Y2];
+
+      int pad_w = 0;
+      int pad_h = 0;
+      if (context_pad > 0) {
+        // scale factor by which to expand the original region 
+        // such that after warping the expanded region to cropsize x cropsize
+        // there's exactly context_pad amount of padding on each side
+        Dtype context_scale = static_cast<Dtype>(cropsize) /
+            static_cast<Dtype>(cropsize - 2*context_pad);
+
+        // compute the expanded region
+        Dtype half_height = static_cast<Dtype>(y2-y1+1)/2.0;
+        Dtype half_width = static_cast<Dtype>(x2-x1+1)/2.0;
+        Dtype center_x = static_cast<Dtype>(x1) + half_width;
+        Dtype center_y = static_cast<Dtype>(y1) + half_height;
+        x1 = static_cast<int>(round(center_x - half_width*context_scale));
+        x2 = static_cast<int>(round(center_x + half_width*context_scale));
+        y1 = static_cast<int>(round(center_y - half_height*context_scale));
+        y2 = static_cast<int>(round(center_y + half_height*context_scale));
+        
+        // the expanded region may go outside of the image
+        // so we compute the clipped (expanded) region and keep track of
+        // the extent beyond the image
+        int unclipped_height = y2-y1+1;
+        int unclipped_width = x2-x1+1;
+        int pad_x1 = std::max(0, -x1);
+        int pad_y1 = std::max(0, -y1);
+        int pad_x2 = std::max(0, x2 - cv_img.cols + 1);
+        int pad_y2 = std::max(0, y2 - cv_img.rows + 1);
+        // clip bounds
+        x1 = x1 + pad_x1;
+        x2 = x2 - pad_x2;
+        y1 = y1 + pad_y1;
+        y2 = y2 - pad_y2;
+        CHECK_GT(x1, -1);
+        CHECK_GT(y1, -1);
+        CHECK_LT(x2, cv_img.cols);
+        CHECK_LT(y2, cv_img.rows);
+
+        int clipped_height = y2-y1+1;
+        int clipped_width = x2-x1+1;
+
+        // scale factors that would be used to warp the unclipped 
+        // expanded region
+        Dtype scale_x = 
+            static_cast<Dtype>(cropsize)/static_cast<Dtype>(unclipped_width);
+        Dtype scale_y = 
+            static_cast<Dtype>(cropsize)/static_cast<Dtype>(unclipped_height);
+
+        // size to warp the clipped expanded region to
+        cv_crop_size.width = 
+            static_cast<int>(round(static_cast<Dtype>(clipped_width)*scale_x));
+        cv_crop_size.height = 
+            static_cast<int>(round(static_cast<Dtype>(clipped_height)*scale_y));
+        pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1)*scale_x));
+        pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2)*scale_x));
+        pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1)*scale_y));
+        pad_y2 = static_cast<int>(round(static_cast<Dtype>(pad_y2)*scale_y));
+
+        pad_h = pad_y1;
+        // if we're mirroring, we mirror the padding too (to be pedantic)
+        if (do_mirror) {
+          pad_w = pad_x2;
+        } else {
+          pad_w = pad_x1;
+        }
+
+        // ensure that the warped, clipped region plus the padding
+        // fits in the cropsize x cropsize image (it might not due to rounding)
+        if (pad_h + cv_crop_size.height > cropsize) {
+          cv_crop_size.height = cropsize - pad_h;
+        }
+        if (pad_w + cv_crop_size.width > cropsize) {
+          cv_crop_size.width = cropsize - pad_w;
+        }
+      }
+
+//      CHECK_GT(x1, -1);
+//      CHECK_GT(y1, -1);
+//      CHECK_LT(x1, cv_img.cols);
+//      CHECK_LT(y1, cv_img.rows);
+//      CHECK_GT(x2, x1-1);
+//      CHECK_GT(y2, y1-1);
+//      CHECK_LT(x2, cv_img.cols);
+//      CHECK_LT(y2, cv_img.rows);
+
       cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1);
       cv::Mat cv_cropped_img = cv_img(roi);
       cv::resize(cv_cropped_img, cv_cropped_img, 
           cv_crop_size, 0, 0, cv::INTER_LINEAR);
       
       // horizontal flip at random
-//      bool is_mirror = false;
-      if (mirror && rand() % 2) {
+      if (do_mirror) {
         cv::flip(cv_cropped_img, cv_cropped_img, 1);
-//        is_mirror = true;
       }
-      
-      // TODO(rbg): this could probably be made more efficient
-      // but this thread finishes before the GPU is ready, 
-      // so it's fine for now
+
+      // copy the warped window into top_data, offset by (pad_h, pad_w)
       for (int c = 0; c < channels; ++c) {
-        for (int h = 0; h < cropsize; ++h) {
-          for (int w = 0; w < cropsize; ++w) {
+        for (int h = 0; h < cv_cropped_img.rows; ++h) {
+          for (int w = 0; w < cv_cropped_img.cols; ++w) {
             Dtype pixel = 
                 static_cast<Dtype>(cv_cropped_img.at<cv::Vec3b>(h, w)[c]);
 
-            top_data[((itemid * channels + c) * cropsize + h) * cropsize + w]
+            top_data[((itemid * channels + c) * cropsize + h + pad_h) * cropsize + w + pad_w]
                 = (pixel
-                    - mean[(c * mean_height + h + mean_off) 
-                           * mean_width + w + mean_off])
+                    - mean[(c * mean_height + h + mean_off + pad_h) 
+                           * mean_width + w + mean_off + pad_w])
                   * scale;
           }
         }
@@ -120,17 +211,13 @@ void* WindowDataLayerPrefetch(void* layer_pointer) {
 //      ss >> file_id;
 //      std::ofstream inf((string("dump/") + file_id + string("_info.txt")).c_str(), std::ofstream::out);
 //      inf << image.first << std::endl 
-//          << x1+1 << std::endl
-//          << y1+1 << std::endl
-//          << x2+1 << std::endl
-//          << y2+1 << std::endl
-//          << is_mirror << std::endl
+//          << window[WindowDataLayer<Dtype>::X1]+1 << std::endl
+//          << window[WindowDataLayer<Dtype>::Y1]+1 << std::endl
+//          << window[WindowDataLayer<Dtype>::X2]+1 << std::endl
+//          << window[WindowDataLayer<Dtype>::Y2]+1 << std::endl
+//          << do_mirror << std::endl
 //          << top_label[itemid] << std::endl
 //          << is_fg << std::endl;
-////          << "is_fg: " << is_fg << std::endl
-////          << "label: " << top_label[itemid] << " " << window[WindowDataLayer<Dtype>::LABEL] << std::endl
-////          << "num bg samples: " << num_samples[0] << std::endl
-////          << "num fg samples: " << num_samples[1];
 //      inf.close();
 //      std::ofstream top_data_file((string("dump/") + file_id + string("_data.txt")).c_str(), 
 //            std::ofstream::out | std::ofstream::binary);
@@ -233,7 +320,7 @@ void WindowDataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       label_hist[window[WindowDataLayer::LABEL]]++;
     }
 
-    if (image_index % 1 == 0) {
+    if (image_index % 100 == 0) {
       LOG(INFO) << "num: " << image_index << " "
           << image_path << " " 
           << image_size[0] << " "
@@ -243,12 +330,15 @@ void WindowDataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
     }
   }
 
-  LOG(INFO) << "Number of images: " << image_index;
+  LOG(INFO) << "Number of images: " << image_index+1;
 
   for (int i = 0; i < 21; ++i) {
     LOG(INFO) << "class " << i << " has " << label_hist[i] << " samples";
   }
 
+  LOG(INFO) << "Amount of context padding: " 
+      << this->layer_param_.context_pad();
+
   // image
   int cropsize = this->layer_param_.cropsize();
   CHECK_GT(cropsize, 0);
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index a1b7776..5f82c19 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -104,6 +104,10 @@ message LayerParameter {
   // network's parameters when finetuning
   optional bool can_clobber = 57 [default = true];
 
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 context_pad = 58 [default = 0];
+
   // For ReshapeLayer, one needs to specify the new dimensions.
   optional int32 new_num = 60 [default = 0];
   optional int32 new_channels = 61 [default = 0];
-- 
2.7.4