From 63323765f5d3192cfa3e3dc06fba79a475374ab2 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer
Date: Mon, 1 Sep 2014 20:27:33 -0700
Subject: [PATCH] strategize pooling

Scaffold engine switching for pooling. The Caffe pooling is instantiated
without regard for engine in:

- LRNLayer
- PoolingLayer tests
- StochasticPoolingLayer tests
- MaxPoolingDropout tests
---
 include/caffe/vision_layers.hpp                  |  44 +++-
 src/caffe/layer_factory.cpp                      |  18 +-
 src/caffe/layers/caffe_pooling_layer.cpp         | 230 +++++++++++++++++++++
 .../{pooling_layer.cu => caffe_pooling_layer.cu} |  55 ++---
 src/caffe/layers/lrn_layer.cpp                   |   2 +-
 src/caffe/layers/pooling_layer.cpp               | 206 +-----------------
 src/caffe/test/test_maxpool_dropout_layers.cpp   |   6 +-
 src/caffe/test/test_pooling_layer.cpp            |  22 +-
 src/caffe/test/test_stochastic_pooling.cpp       |   8 +-
 9 files changed, 337 insertions(+), 254 deletions(-)
 create mode 100644 src/caffe/layers/caffe_pooling_layer.cpp
 rename src/caffe/layers/{pooling_layer.cu => caffe_pooling_layer.cu} (86%)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 5bdd4f7..46df807 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -238,13 +238,13 @@ class PoolingLayer : public Layer<Dtype> {

  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
+      vector<Blob<Dtype>*>* top) = 0;
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
+      vector<Blob<Dtype>*>* top) = 0;
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) = 0;
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) = 0;

   int kernel_h_, kernel_w_;
   int stride_h_, stride_w_;
@@ -258,6 +258,42 @@ class PoolingLayer : public Layer<Dtype> {
   Blob<Dtype> max_idx_;
 };

+/* CaffePoolingLayer
+*/
+template <typename Dtype>
+class CaffePoolingLayer : public PoolingLayer<Dtype> {
+ public:
+  explicit CaffePoolingLayer(const LayerParameter& param)
+      : PoolingLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+
+  virtual inline LayerParameter_LayerType type() const {
+    return LayerParameter_LayerType_POOLING;
+  }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  // MAX POOL layers can output an extra top blob for the mask;
+  // others can only output the pooled inputs.
+  virtual inline int MaxTopBlobs() const {
+    return (this->layer_param_.pooling_param().pool() ==
+            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+  }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
+
+  Blob<Dtype> rand_idx_;
+  Blob<Dtype> max_idx_;
+};
+
 }  // namespace caffe

 #endif  // CAFFE_VISION_LAYERS_HPP_
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index ae9ff36..2ffb7f8 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -26,6 +26,22 @@ template ConvolutionLayer<float>* GetConvolutionLayer(const string& name,
 template ConvolutionLayer<double>* GetConvolutionLayer(const string& name,
     const LayerParameter& param);

+// Get pooling layer according to engine.
+template <typename Dtype>
+PoolingLayer<Dtype>* GetPoolingLayer(const string& name,
+    const LayerParameter& param) {
+  PoolingParameter_Engine engine = param.pooling_param().engine();
+  if (engine == PoolingParameter_Engine_CAFFE) {
+    return new CaffePoolingLayer<Dtype>(param);
+  } else {
+    LOG(FATAL) << "Layer " << name << " has unknown engine.";
+  }
+}
+
+template PoolingLayer<float>* GetPoolingLayer(const string& name,
+    const LayerParameter& param);
+template PoolingLayer<double>* GetPoolingLayer(const string& name,
+    const LayerParameter& param);

 // A function to get a specific layer from the specification given in
 // LayerParameter. Ideally this would be replaced by a factory pattern,
@@ -82,7 +98,7 @@ Layer<Dtype>* GetLayer(const LayerParameter& param) {
   case LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS:
     return new MultinomialLogisticLossLayer<Dtype>(param);
   case LayerParameter_LayerType_POOLING:
-    return new PoolingLayer<Dtype>(param);
+    return GetPoolingLayer<Dtype>(name, param);
   case LayerParameter_LayerType_POWER:
     return new PowerLayer<Dtype>(param);
   case LayerParameter_LayerType_RELU:
diff --git a/src/caffe/layers/caffe_pooling_layer.cpp b/src/caffe/layers/caffe_pooling_layer.cpp
new file mode 100644
index 0000000..5756b74
--- /dev/null
+++ b/src/caffe/layers/caffe_pooling_layer.cpp
@@ -0,0 +1,230 @@
+#include <algorithm>
+#include <cfloat>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+using std::min;
+using std::max;
+
+template <typename Dtype>
+void CaffePoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
+  PoolingParameter pool_param = this->layer_param_.pooling_param();
+  // If max pooling, we will initialize the vector index part.
+  if (this->layer_param_.pooling_param().pool() ==
+      PoolingParameter_PoolMethod_MAX && top->size() == 1) {
+    max_idx_.Reshape(bottom[0]->num(), this->channels_, this->pooled_height_,
+        this->pooled_width_);
+  }
+  // If stochastic pooling, we will initialize the random index part.
+  if (this->layer_param_.pooling_param().pool() ==
+      PoolingParameter_PoolMethod_STOCHASTIC) {
+    rand_idx_.Reshape(bottom[0]->num(), this->channels_, this->pooled_height_,
+        this->pooled_width_);
+  }
+}
+
+// TODO(Yangqing): Is there a faster way to do pooling in the channel-first
+// case?
+template <typename Dtype>
+void CaffePoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = (*top)[0]->mutable_cpu_data();
+  const int top_count = (*top)[0]->count();
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top->size() > 1;
+  int* mask = NULL;  // suppress warnings about uninitialized variables
+  Dtype* top_mask = NULL;
+  // Different pooling methods. We explicitly do the switch outside the for
+  // loop to save time, although this results in more code.
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    // Initialize
+    if (use_top_mask) {
+      top_mask = (*top)[1]->mutable_cpu_data();
+      caffe_set(top_count, Dtype(-1), top_mask);
+    } else {
+      mask = max_idx_.mutable_cpu_data();
+      caffe_set(top_count, -1, mask);
+    }
+    caffe_set(top_count, Dtype(-FLT_MAX), top_data);
+    // The main loop
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      for (int c = 0; c < this->channels_; ++c) {
+        for (int ph = 0; ph < this->pooled_height_; ++ph) {
+          for (int pw = 0; pw < this->pooled_width_; ++pw) {
+            int hstart = ph * this->stride_h_ - this->pad_h_;
+            int wstart = pw * this->stride_w_ - this->pad_w_;
+            int hend = min(hstart + this->kernel_h_, this->height_);
+            int wend = min(wstart + this->kernel_w_, this->width_);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            const int pool_index = ph * this->pooled_width_ + pw;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * this->width_ + w;
+                if (bottom_data[index] > top_data[pool_index]) {
+                  top_data[pool_index] = bottom_data[index];
+                  if (use_top_mask) {
+                    top_mask[pool_index] = static_cast<Dtype>(index);
+                  } else {
+                    mask[pool_index] = index;
+                  }
+                }
+              }
+            }
+          }
+        }
+        // compute offset
+        bottom_data += bottom[0]->offset(0, 1);
+        top_data += (*top)[0]->offset(0, 1);
+        if (use_top_mask) {
+          top_mask += (*top)[0]->offset(0, 1);
+        } else {
+          mask += (*top)[0]->offset(0, 1);
+        }
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    for (int i = 0; i < top_count; ++i) {
+      top_data[i] = 0;
+    }
+    // The main loop
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      for (int c = 0; c < this->channels_; ++c) {
+        for (int ph = 0; ph < this->pooled_height_; ++ph) {
+          for (int pw = 0; pw < this->pooled_width_; ++pw) {
+            int hstart = ph * this->stride_h_ - this->pad_h_;
+            int wstart = pw * this->stride_w_ - this->pad_w_;
+            int hend = min(hstart + this->kernel_h_,
+                           this->height_ + this->pad_h_);
+            int wend = min(wstart + this->kernel_w_,
+                           this->width_ + this->pad_w_);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, this->height_);
+            wend = min(wend, this->width_);
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                top_data[ph * this->pooled_width_ + pw] +=
+                    bottom_data[h * this->width_ + w];
+              }
+            }
+            top_data[ph * this->pooled_width_ + pw] /= pool_size;
+          }
+        }
+        // compute offset
+        bottom_data += bottom[0]->offset(0, 1);
+        top_data += (*top)[0]->offset(0, 1);
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    NOT_IMPLEMENTED;
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
+}
+
+template <typename Dtype>
+void CaffePoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
+  // Different pooling methods. We explicitly do the switch outside the for
+  // loop to save time, although this results in more code.
+  caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff);
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  const int* mask = NULL;  // suppress warnings about uninitialized variables
+  const Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    // The main loop
+    if (use_top_mask) {
+      top_mask = top[1]->cpu_data();
+    } else {
+      mask = max_idx_.cpu_data();
+    }
+    for (int n = 0; n < top[0]->num(); ++n) {
+      for (int c = 0; c < this->channels_; ++c) {
+        for (int ph = 0; ph < this->pooled_height_; ++ph) {
+          for (int pw = 0; pw < this->pooled_width_; ++pw) {
+            const int index = ph * this->pooled_width_ + pw;
+            const int bottom_index =
+                use_top_mask ? top_mask[index] : mask[index];
+            bottom_diff[bottom_index] += top_diff[index];
+          }
+        }
+        bottom_diff += (*bottom)[0]->offset(0, 1);
+        top_diff += top[0]->offset(0, 1);
+        if (use_top_mask) {
+          top_mask += top[0]->offset(0, 1);
+        } else {
+          mask += top[0]->offset(0, 1);
+        }
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    // The main loop
+    for (int n = 0; n < top[0]->num(); ++n) {
+      for (int c = 0; c < this->channels_; ++c) {
+        for (int ph = 0; ph < this->pooled_height_; ++ph) {
+          for (int pw = 0; pw < this->pooled_width_; ++pw) {
+            int hstart = ph * this->stride_h_ - this->pad_h_;
+            int wstart = pw * this->stride_w_ - this->pad_w_;
+            int hend = min(hstart + this->kernel_h_,
+                           this->height_ + this->pad_h_);
+            int wend = min(wstart + this->kernel_w_,
+                           this->width_ + this->pad_w_);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, this->height_);
+            wend = min(wend, this->width_);
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                bottom_diff[h * this->width_ + w] +=
+                    top_diff[ph * this->pooled_width_ + pw] / pool_size;
+              }
+            }
+          }
+        }
+        // offset
+        bottom_diff += (*bottom)[0]->offset(0, 1);
+        top_diff += top[0]->offset(0, 1);
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    NOT_IMPLEMENTED;
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(CaffePoolingLayer);
+#endif
+
+INSTANTIATE_CLASS(CaffePoolingLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/caffe_pooling_layer.cu
similarity index 86%
rename from src/caffe/layers/pooling_layer.cu
rename to src/caffe/layers/caffe_pooling_layer.cu
index e64128b..7a8c351 100644
--- a/src/caffe/layers/pooling_layer.cu
+++ b/src/caffe/layers/caffe_pooling_layer.cu
@@ -151,7 +151,7 @@ __global__ void StoPoolForwardTest(const int nthreads,

 template <typename Dtype>
-void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+void CaffePoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = (*top)[0]->mutable_gpu_data();
@@ -169,17 +169,18 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     }
     // NOLINT_NEXT_LINE(whitespace/operators)
     MaxPoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-        mask, top_mask);
+        count, bottom_data, bottom[0]->num(), this->channels_, this->height_,
+        this->width_, this->pooled_height_, this->pooled_width_,
+        this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
+        this->pad_h_, this->pad_w_, top_data, mask, top_mask);
     break;
   case PoolingParameter_PoolMethod_AVE:
     // NOLINT_NEXT_LINE(whitespace/operators)
     AvePoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
+        count, bottom_data, bottom[0]->num(), this->channels_, this->height_,
+        this->width_, this->pooled_height_, this->pooled_width_,
+        this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
+        this->pad_h_, this->pad_w_, top_data);
     break;
   case PoolingParameter_PoolMethod_STOCHASTIC:
     if (Caffe::phase() == Caffe::TRAIN) {
@@ -189,17 +190,18 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       // NOLINT_NEXT_LINE(whitespace/operators)
       StoPoolForwardTrain<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_,
+          count, bottom_data, bottom[0]->num(), this->channels_, this->height_,
+          this->width_, this->pooled_height_, this->pooled_width_,
+          this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
           rand_idx_.mutable_gpu_data(), top_data);
     } else {
       // NOLINT_NEXT_LINE(whitespace/operators)
       StoPoolForwardTest<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_, top_data);
+          count, bottom_data, bottom[0]->num(), this->channels_, this->height_,
+          this->width_, this->pooled_height_, this->pooled_width_,
+          this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
+          top_data);
     }
     break;
   default:
@@ -324,7 +326,7 @@ __global__ void StoPoolBackward(const int nthreads,

 template <typename Dtype>
-void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+void CaffePoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
   if (!propagate_down[0]) {
     return;
@@ -346,24 +348,25 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
     // NOLINT_NEXT_LINE(whitespace/operators)
     MaxPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, mask, top_mask, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_,
-        kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-        bottom_diff);
+        count, top_diff, mask, top_mask, top[0]->num(), this->channels_,
+        this->height_, this->width_, this->pooled_height_, this->pooled_width_,
+        this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
+        this->pad_h_, this->pad_w_, bottom_diff);
     break;
   case PoolingParameter_PoolMethod_AVE:
     // NOLINT_NEXT_LINE(whitespace/operators)
     AvePoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
+        count, top_diff, top[0]->num(), this->channels_, this->height_,
+        this->width_, this->pooled_height_, this->pooled_width_,
+        this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
+        this->pad_h_, this->pad_w_, bottom_diff);
     break;
   case PoolingParameter_PoolMethod_STOCHASTIC:
     // NOLINT_NEXT_LINE(whitespace/operators)
     StoPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, rand_idx_.gpu_data(), top_diff,
-        top[0]->num(), channels_, height_, width_, pooled_height_,
-        pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
+        count, rand_idx_.gpu_data(), top_diff, top[0]->num(), this->channels_,
+        this->height_, this->width_, this->pooled_height_, this->pooled_width_,
+        this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_,
         bottom_diff);
     break;
   default:
@@ -373,7 +376,7 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 }

-INSTANTIATE_CLASS(PoolingLayer);
+INSTANTIATE_CLASS(CaffePoolingLayer);

 }  // namespace
 caffe
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index e81a32b..c792f65 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -53,7 +53,7 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       PoolingParameter_PoolMethod_AVE);
   pool_param.mutable_pooling_param()->set_pad(pre_pad_);
   pool_param.mutable_pooling_param()->set_kernel_size(size_);
-  pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
+  pool_layer_.reset(new CaffePoolingLayer<Dtype>(pool_param));
   pool_layer_->SetUp(square_top_vec_, &pool_top_vec_);
   CHECK_EQ(pool_output_.num(), num_);
   CHECK_EQ(pool_output_.channels(), channels_);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 9e77fa2..04f4776 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -52,9 +52,9 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     stride_w_ = pool_param.stride_w();
   }
   if (pad_h_ != 0 || pad_w_ != 0) {
-    CHECK(this->layer_param_.pooling_param().pool()
+    CHECK(pool_param.pool()
         == PoolingParameter_PoolMethod_AVE
-        || this->layer_param_.pooling_param().pool()
+        || pool_param.pool()
         == PoolingParameter_PoolMethod_MAX)
         << "Padding implemented only for average and max pooling.";
     CHECK_LT(pad_h_, kernel_h_);
@@ -84,210 +84,8 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   if (top->size() > 1) {
     (*top)[1]->ReshapeLike(*(*top)[0]);
   }
-  // If max pooling, we will initialize the vector index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_MAX && top->size() == 1) {
-    max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-        pooled_width_);
-  }
-  // If stochastic pooling, we will initialize the random index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_STOCHASTIC) {
-    rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-        pooled_width_);
-  }
-}
-
-// TODO(Yangqing): Is there a faster way to do pooling in the channel-first
-// case?
-template <typename Dtype>
-void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = (*top)[0]->mutable_cpu_data();
-  const int top_count = (*top)[0]->count();
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top->size() > 1;
-  int* mask = NULL;  // suppress warnings about uninitalized variables
-  Dtype* top_mask = NULL;
-  // Different pooling methods. We explicitly do the switch outside the for
-  // loop to save time, although this results in more code.
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    // Initialize
-    if (use_top_mask) {
-      top_mask = (*top)[1]->mutable_cpu_data();
-      caffe_set(top_count, Dtype(-1), top_mask);
-    } else {
-      mask = max_idx_.mutable_cpu_data();
-      caffe_set(top_count, -1, mask);
-    }
-    caffe_set(top_count, Dtype(-FLT_MAX), top_data);
-    // The main loop
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_);
-            int wend = min(wstart + kernel_w_, width_);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            const int pool_index = ph * pooled_width_ + pw;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                const int index = h * width_ + w;
-                if (bottom_data[index] > top_data[pool_index]) {
-                  top_data[pool_index] = bottom_data[index];
-                  if (use_top_mask) {
-                    top_mask[pool_index] = static_cast<Dtype>(index);
-                  } else {
-                    mask[pool_index] = index;
-                  }
-                }
-              }
-            }
-          }
-        }
-        // compute offset
-        bottom_data += bottom[0]->offset(0, 1);
-        top_data += (*top)[0]->offset(0, 1);
-        if (use_top_mask) {
-          top_mask += (*top)[0]->offset(0, 1);
-        } else {
-          mask += (*top)[0]->offset(0, 1);
-        }
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    for (int i = 0; i < top_count; ++i) {
-      top_data[i] = 0;
-    }
-    // The main loop
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_ + pad_h_);
-            int wend = min(wstart + kernel_w_, width_ + pad_w_);
-            int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height_);
-            wend = min(wend, width_);
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                top_data[ph * pooled_width_ + pw] +=
-                    bottom_data[h * width_ + w];
-              }
-            }
-            top_data[ph * pooled_width_ + pw] /= pool_size;
-          }
-        }
-        // compute offset
-        bottom_data += bottom[0]->offset(0, 1);
-        top_data += (*top)[0]->offset(0, 1);
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    NOT_IMPLEMENTED;
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-}
-
-template <typename Dtype>
-void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
-  // Different pooling methods. We explicitly do the switch outside the for
-  // loop to save time, although this results in more codes.
-  caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff);
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  const int* mask = NULL;  // suppress warnings about uninitialized variables
-  const Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    // The main loop
-    if (use_top_mask) {
-      top_mask = top[1]->cpu_data();
-    } else {
-      mask = max_idx_.cpu_data();
-    }
-    for (int n = 0; n < top[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            const int index = ph * pooled_width_ + pw;
-            const int bottom_index =
-                use_top_mask ? top_mask[index] : mask[index];
-            bottom_diff[bottom_index] += top_diff[index];
-          }
-        }
-        bottom_diff += (*bottom)[0]->offset(0, 1);
-        top_diff += top[0]->offset(0, 1);
-        if (use_top_mask) {
-          top_mask += top[0]->offset(0, 1);
-        } else {
-          mask += top[0]->offset(0, 1);
-        }
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // The main loop
-    for (int n = 0; n < top[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_ + pad_h_);
-            int wend = min(wstart + kernel_w_, width_ + pad_w_);
-            int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height_);
-            wend = min(wend, width_);
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                bottom_diff[h * width_ + w] +=
-                    top_diff[ph * pooled_width_ + pw] / pool_size;
-              }
-            }
-          }
-        }
-        // offset
-        bottom_diff += (*bottom)[0]->offset(0, 1);
-        top_diff += top[0]->offset(0, 1);
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    NOT_IMPLEMENTED;
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-}
-
-
-#ifdef CPU_ONLY
-STUB_GPU(PoolingLayer);
-#endif
-
 INSTANTIATE_CLASS(PoolingLayer);
-
 }  // namespace caffe
diff --git a/src/caffe/test/test_maxpool_dropout_layers.cpp b/src/caffe/test/test_maxpool_dropout_layers.cpp
index 311c778..ffafbec 100644
--- a/src/caffe/test/test_maxpool_dropout_layers.cpp
+++ b/src/caffe/test/test_maxpool_dropout_layers.cpp
@@ -46,7 +46,7 @@ TYPED_TEST(MaxPoolingDropoutTest, TestSetup) {
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
-  PoolingLayer<Dtype> max_layer(layer_param);
+  CaffePoolingLayer<Dtype> max_layer(layer_param);
   max_layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   DropoutLayer<Dtype> dropout_layer(layer_param);
   dropout_layer.SetUp(this->blob_top_vec_, &(this->blob_top_vec_));
@@ -63,7 +63,7 @@ TYPED_TEST(MaxPoolingDropoutTest, TestForward) {
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
   const Dtype* top_data = this->blob_top_->cpu_data();
@@ -93,7 +93,7 @@ TYPED_TEST(MaxPoolingDropoutTest, TestBackward) {
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
   for (int i = 0; i < this->blob_top_->count(); ++i) {
diff --git a/src/caffe/test/test_pooling_layer.cpp b/src/caffe/test/test_pooling_layer.cpp
index 5be01f7..4361949 100644
--- a/src/caffe/test/test_pooling_layer.cpp
+++ b/src/caffe/test/test_pooling_layer.cpp
@@ -72,7 +72,7 @@ class PoolingLayerTest : public MultiDeviceTest<TypeParam> {
     blob_bottom_->mutable_cpu_data()[i + 13] = 2;
     blob_bottom_->mutable_cpu_data()[i + 14] = 3;
   }
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
   EXPECT_EQ(blob_top_->num(), num);
   EXPECT_EQ(blob_top_->channels(), channels);
@@ -170,7 +170,7 @@ class PoolingLayerTest : public MultiDeviceTest<TypeParam> {
     blob_bottom_->mutable_cpu_data()[i + 34] = 18;
     blob_bottom_->mutable_cpu_data()[i + 35] = 11;
   }
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
   EXPECT_EQ(blob_top_->num(), num);
   EXPECT_EQ(blob_top_->channels(), channels);
@@ -295,7 +295,7 @@ class PoolingLayerTest : public MultiDeviceTest<TypeParam> {
     blob_bottom_->mutable_cpu_data()[i + 34] = 18;
     blob_bottom_->mutable_cpu_data()[i + 35] = 11;
   }
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
   EXPECT_EQ(blob_top_->num(), num);
   EXPECT_EQ(blob_top_->channels(), channels);
@@ -376,7 +376,7 @@ TYPED_TEST(PoolingLayerTest, TestSetup) {
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
   EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
@@ -392,7 +392,7 @@ TYPED_TEST(PoolingLayerTest, TestSetupPadded) {
   pooling_param->set_stride(2);
   pooling_param->set_pad(1);
   pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
   EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
@@ -450,7 +450,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientMax) {
       pooling_param->set_stride(2);
       pooling_param->set_pad(1);
       pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
-      PoolingLayer<Dtype> layer(layer_param);
+      CaffePoolingLayer<Dtype> layer(layer_param);
       GradientChecker<Dtype> checker(1e-4, 1e-2);
       checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
           &(this->blob_top_vec_));
@@ -480,7 +480,7 @@ TYPED_TEST(PoolingLayerTest, TestForwardMaxPadded) {
   this->blob_bottom_->mutable_cpu_data()[6] = 4;
   this->blob_bottom_->mutable_cpu_data()[7] = 2;
   this->blob_bottom_->mutable_cpu_data()[8] = 1;
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   EXPECT_EQ(this->blob_top_->num(), 1);
   EXPECT_EQ(this->blob_top_->channels(), 1);
@@ -514,7 +514,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientMaxTopMask) {
       pooling_param->set_stride(2);
       pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
       this->blob_top_vec_.push_back(this->blob_top_mask_);
-      PoolingLayer<Dtype> layer(layer_param);
+      CaffePoolingLayer<Dtype> layer(layer_param);
       GradientChecker<Dtype> checker(1e-4, 1e-2);
       checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
          &(this->blob_top_vec_));
@@ -536,7 +536,7 @@ TYPED_TEST(PoolingLayerTest, TestForwardAve) {
   filler_param.set_value(Dtype(2));
   ConstantFiller<Dtype> filler(filler_param);
   filler.Fill(this->blob_bottom_);
-  PoolingLayer<Dtype> layer(layer_param);
+  CaffePoolingLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   EXPECT_EQ(this->blob_top_->num(), 1);
   EXPECT_EQ(this->blob_top_->channels(), 1);
@@ -565,7 +565,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientAve) {
      pooling_param->set_kernel_w(kernel_w);
      pooling_param->set_stride(2);
      pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
-     PoolingLayer<Dtype> layer(layer_param);
+     CaffePoolingLayer<Dtype> layer(layer_param);
      GradientChecker<Dtype> checker(1e-2, 1e-2);
      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
          &(this->blob_top_vec_));
@@ -584,7 +584,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientAvePadded) {
      pooling_param->set_stride(2);
      pooling_param->set_pad(2);
      pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
-     PoolingLayer<Dtype> layer(layer_param);
+     CaffePoolingLayer<Dtype> layer(layer_param);
      GradientChecker<Dtype> checker(1e-2, 1e-2);
      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
          &(this->blob_top_vec_));
diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp
index 4f13981..51edbb3 100644
--- a/src/caffe/test/test_stochastic_pooling.cpp
+++ b/src/caffe/test/test_stochastic_pooling.cpp
@@ -52,7 +52,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestSetup) {
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
-  PoolingLayer<TypeParam> layer(layer_param);
+  CaffePoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
   EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
@@ -68,7 +68,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestStochasticGPU) {
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
-  PoolingLayer<TypeParam> layer(layer_param);
+  CaffePoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
@@ -112,7 +112,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestStochasticGPUTestPhase) {
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
-  PoolingLayer<TypeParam> layer(layer_param);
+  CaffePoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
@@ -150,7 +150,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) {
   pooling_param->set_kernel_size(3);
   pooling_param->set_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
-  PoolingLayer<TypeParam> layer(layer_param);
+  CaffePoolingLayer<TypeParam> layer(layer_param);
   GradientChecker<TypeParam> checker(1e-4, 1e-2);
   // it is too expensive to call curand multiple times, so we don't do an
   // exhaustive gradient check.
-- 
2.7.4
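
Editor's note (outside the patch; git am ignores text after the "-- " signature line): with this scaffold in place, callers select the pooling implementation through the engine field that GetPoolingLayer reads from PoolingParameter. Below is a minimal, hedged usage sketch in C++. It assumes only the types and enum values visible in the patch (CaffePoolingLayer, PoolingParameter_PoolMethod_MAX, PoolingParameter_Engine_CAFFE) plus the standard protobuf-generated setter set_engine; it is an illustration, not part of the change.

    // Hedged sketch: exercising the engine switch introduced by this patch.
    #include <vector>

    #include "caffe/proto/caffe.pb.h"
    #include "caffe/vision_layers.hpp"

    int main() {
      // Request pooling through the Caffe engine. GetPoolingLayer() in
      // layer_factory.cpp maps this engine value to CaffePoolingLayer and
      // hits the LOG(FATAL) "unknown engine" branch for anything else.
      caffe::LayerParameter param;
      caffe::PoolingParameter* pool = param.mutable_pooling_param();
      pool->set_pool(caffe::PoolingParameter_PoolMethod_MAX);
      pool->set_kernel_size(3);
      pool->set_stride(2);
      pool->set_engine(caffe::PoolingParameter_Engine_CAFFE);

      // Direct construction; equivalent to what the factory returns for
      // this parameter when dispatched through GetLayer<float>(param).
      caffe::CaffePoolingLayer<float> layer(param);
      return 0;
    }

The one-implementation-class-per-engine split mirrors the convolution scaffolding already present at the top of layer_factory.cpp, leaving room for additional engines to slot into the same switch later.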