From 17a0c1686b1de328765b94e44d2636c6cb15caf4 Mon Sep 17 00:00:00 2001 From: Ross Girshick Date: Thu, 9 Jan 2014 13:48:50 -0800 Subject: [PATCH] support for adding padding to windows in the window_data_layer --- models/pascal_finetune.prototxt | 1 + models/pascal_finetune_solver.prototxt | 2 +- models/pascal_finetune_val.prototxt | 1 + src/caffe/layers/window_data_layer.cpp | 144 ++++++++++++++++++++++++++------- src/caffe/proto/caffe.proto | 4 + 5 files changed, 124 insertions(+), 28 deletions(-) diff --git a/models/pascal_finetune.prototxt b/models/pascal_finetune.prototxt index 5ee9e8a..229898e 100644 --- a/models/pascal_finetune.prototxt +++ b/models/pascal_finetune.prototxt @@ -7,6 +7,7 @@ layers { meanfile: "/home/rbg/working/caffe-rbg/data/ilsvrc2012_mean.binaryproto" batchsize: 128 cropsize: 227 + context_pad: 16 mirror: true det_fg_threshold: 0.5 det_bg_threshold: 0.5 diff --git a/models/pascal_finetune_solver.prototxt b/models/pascal_finetune_solver.prototxt index 7ea9c5d..f53a1d5 100644 --- a/models/pascal_finetune_solver.prototxt +++ b/models/pascal_finetune_solver.prototxt @@ -11,4 +11,4 @@ max_iter: 100000 momentum: 0.9 weight_decay: 0.0005 snapshot: 10000 -snapshot_prefix: "./snapshots/pascal_finetune_train" +snapshot_prefix: "./snapshots/pascal_context16_finetune_train" diff --git a/models/pascal_finetune_val.prototxt b/models/pascal_finetune_val.prototxt index c73cf8f..53aab09 100644 --- a/models/pascal_finetune_val.prototxt +++ b/models/pascal_finetune_val.prototxt @@ -7,6 +7,7 @@ layers { meanfile: "/home/rbg/working/caffe-rbg/data/ilsvrc2012_mean.binaryproto" batchsize: 128 cropsize: 227 + context_pad: 16 mirror: true det_fg_threshold: 0.5 det_bg_threshold: 0.5 diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index a2346bf..e8f52e7 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -39,6 +39,7 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { const Dtype scale = layer->layer_param_.scale(); const int batchsize = layer->layer_param_.batchsize(); const int cropsize = layer->layer_param_.cropsize(); + const int context_pad = layer->layer_param_.context_pad(); const bool mirror = layer->layer_param_.mirror(); const float fg_fraction = layer->layer_param_.det_fg_fraction(); const Dtype* mean = layer->data_mean_.cpu_data(); @@ -47,6 +48,9 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { const int mean_height = layer->data_mean_.height(); cv::Size cv_crop_size(cropsize, cropsize); + // zero out batch + memset(top_data, 0, sizeof(Dtype)*layer->prefetch_data_->count()); + // CHECK_EQ(mean_width, mean_height); // CHECK_EQ(mean_width, 256); // CHECK_EQ(mean_off, 14); @@ -64,6 +68,11 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { ? layer->fg_windows_[rand() % layer->fg_windows_.size()] : layer->bg_windows_[rand() % layer->bg_windows_.size()]; + bool do_mirror = false; + if (mirror && rand() % 2) { + do_mirror = true; + } + // load the image containing the window std::pair > image = layer->image_database_[window[WindowDataLayer::IMAGE_INDEX]]; @@ -77,35 +86,117 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { // CHECK_EQ(channels, 3); // crop window out of image and warp it - const int x1 = window[WindowDataLayer::X1]; - const int y1 = window[WindowDataLayer::Y1]; - const int x2 = window[WindowDataLayer::X2]; - const int y2 = window[WindowDataLayer::Y2]; + int x1 = window[WindowDataLayer::X1]; + int y1 = window[WindowDataLayer::Y1]; + int x2 = window[WindowDataLayer::X2]; + int y2 = window[WindowDataLayer::Y2]; + + int pad_w = 0; + int pad_h = 0; + if (context_pad > 0) { + // scale factor by which to expand the original region + // such that after warping the expanded region to cropsize x cropsize + // there's exactly context_pad amount of padding on each side + Dtype context_scale = static_cast(cropsize) / + static_cast(cropsize - 2*context_pad); + + // compute the expanded region + Dtype half_height = static_cast(y2-y1+1)/2.0; + Dtype half_width = static_cast(x2-x1+1)/2.0; + Dtype center_x = static_cast(x1) + half_width; + Dtype center_y = static_cast(y1) + half_height; + x1 = static_cast(round(center_x - half_width*context_scale)); + x2 = static_cast(round(center_x + half_width*context_scale)); + y1 = static_cast(round(center_y - half_height*context_scale)); + y2 = static_cast(round(center_y + half_height*context_scale)); + + // the expanded region may go outside of the image + // so we compute the clipped (expanded) region and keep track of + // the extent beyond the image + int unclipped_height = y2-y1+1; + int unclipped_width = x2-x1+1; + int pad_x1 = std::max(0, -x1); + int pad_y1 = std::max(0, -y1); + int pad_x2 = std::max(0, x2 - cv_img.cols + 1); + int pad_y2 = std::max(0, y2 - cv_img.rows + 1); + // clip bounds + x1 = x1 + pad_x1; + x2 = x2 - pad_x2; + y1 = y1 + pad_y1; + y2 = y2 - pad_y2; + CHECK_GT(x1, -1); + CHECK_GT(y1, -1); + CHECK_LT(x2, cv_img.cols); + CHECK_LT(y2, cv_img.rows); + + int clipped_height = y2-y1+1; + int clipped_width = x2-x1+1; + + // scale factors that would be used to warp the unclipped + // expanded region + Dtype scale_x = + static_cast(cropsize)/static_cast(unclipped_width); + Dtype scale_y = + static_cast(cropsize)/static_cast(unclipped_height); + + // size to warp the clipped expanded region to + cv_crop_size.width = + static_cast(round(static_cast(clipped_width)*scale_x)); + cv_crop_size.height = + static_cast(round(static_cast(clipped_height)*scale_y)); + pad_x1 = static_cast(round(static_cast(pad_x1)*scale_x)); + pad_x2 = static_cast(round(static_cast(pad_x2)*scale_x)); + pad_y1 = static_cast(round(static_cast(pad_y1)*scale_y)); + pad_y2 = static_cast(round(static_cast(pad_y2)*scale_y)); + + pad_h = pad_y1; + // if we're mirroring, we mirror the padding too (to be pedantic) + if (do_mirror) { + pad_w = pad_x2; + } else { + pad_w = pad_x1; + } + + // ensure that the warped, clipped region plus the padding + // fits in the cropsize x cropsize image (it might not due to rounding) + if (pad_h + cv_crop_size.height > cropsize) { + cv_crop_size.height = cropsize - pad_h; + } + if (pad_w + cv_crop_size.width > cropsize) { + cv_crop_size.width = cropsize - pad_w; + } + } + +// CHECK_GT(x1, -1); +// CHECK_GT(y1, -1); +// CHECK_LT(x1, cv_img.cols); +// CHECK_LT(y1, cv_img.rows); +// CHECK_GT(x2, x1-1); +// CHECK_GT(y2, y1-1); +// CHECK_LT(x2, cv_img.cols); +// CHECK_LT(y2, cv_img.rows); + cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1); cv::Mat cv_cropped_img = cv_img(roi); cv::resize(cv_cropped_img, cv_cropped_img, cv_crop_size, 0, 0, cv::INTER_LINEAR); // horizontal flip at random -// bool is_mirror = false; - if (mirror && rand() % 2) { + if (do_mirror) { cv::flip(cv_cropped_img, cv_cropped_img, 1); -// is_mirror = true; } - - // TODO(rbg): this could probably be made more efficient - // but this thread finishes before the GPU is ready, - // so it's fine for now + + // copy the warped window into top_data for (int c = 0; c < channels; ++c) { - for (int h = 0; h < cropsize; ++h) { - for (int w = 0; w < cropsize; ++w) { + for (int h = 0; h < cv_cropped_img.rows; ++h) { + for (int w = 0; w < cv_cropped_img.cols; ++w) { Dtype pixel = static_cast(cv_cropped_img.at(h, w)[c]); - top_data[((itemid * channels + c) * cropsize + h) * cropsize + w] + top_data[((itemid * channels + c) * cropsize + h + pad_h) * cropsize + w + pad_w] = (pixel - - mean[(c * mean_height + h + mean_off) - * mean_width + w + mean_off]) + - mean[(c * mean_height + h + mean_off + pad_h) + * mean_width + w + mean_off + pad_w]) * scale; } } @@ -120,17 +211,13 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { // ss >> file_id; // std::ofstream inf((string("dump/") + file_id + string("_info.txt")).c_str(), std::ofstream::out); // inf << image.first << std::endl -// << x1+1 << std::endl -// << y1+1 << std::endl -// << x2+1 << std::endl -// << y2+1 << std::endl -// << is_mirror << std::endl +// << window[WindowDataLayer::X1]+1 << std::endl +// << window[WindowDataLayer::Y1]+1 << std::endl +// << window[WindowDataLayer::X2]+1 << std::endl +// << window[WindowDataLayer::Y2]+1 << std::endl +// << do_mirror << std::endl // << top_label[itemid] << std::endl // << is_fg << std::endl; -//// << "is_fg: " << is_fg << std::endl -//// << "label: " << top_label[itemid] << " " << window[WindowDataLayer::LABEL] << std::endl -//// << "num bg samples: " << num_samples[0] << std::endl -//// << "num fg samples: " << num_samples[1]; // inf.close(); // std::ofstream top_data_file((string("dump/") + file_id + string("_data.txt")).c_str(), // std::ofstream::out | std::ofstream::binary); @@ -233,7 +320,7 @@ void WindowDataLayer::SetUp(const vector*>& bottom, label_hist[window[WindowDataLayer::LABEL]]++; } - if (image_index % 1 == 0) { + if (image_index % 100 == 0) { LOG(INFO) << "num: " << image_index << " " << image_path << " " << image_size[0] << " " @@ -243,12 +330,15 @@ void WindowDataLayer::SetUp(const vector*>& bottom, } } - LOG(INFO) << "Number of images: " << image_index; + LOG(INFO) << "Number of images: " << image_index+1; for (int i = 0; i < 21; ++i) { LOG(INFO) << "class " << i << " has " << label_hist[i] << " samples"; } + LOG(INFO) << "Amount of context padding: " + << this->layer_param_.context_pad(); + // image int cropsize = this->layer_param_.cropsize(); CHECK_GT(cropsize, 0); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a1b7776..5f82c19 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -104,6 +104,10 @@ message LayerParameter { // network's parameters when finetuning optional bool can_clobber = 57 [default = true]; + // Amount of contextual padding to add around a window + // (used only by the window_data_layer) + optional uint32 context_pad = 58 [default = 0]; + // For ReshapeLayer, one needs to specify the new dimensions. optional int32 new_num = 60 [default = 0]; optional int32 new_channels = 61 [default = 0]; -- 2.7.4