strategize cuDNN pooling
authorEvan Shelhamer <shelhamer@imaginarynumber.net>
Sat, 6 Sep 2014 04:47:56 +0000 (21:47 -0700)
committerEvan Shelhamer <shelhamer@imaginarynumber.net>
Sun, 7 Sep 2014 17:25:23 +0000 (19:25 +0200)
include/caffe/vision_layers.hpp
src/caffe/layer_factory.cpp
src/caffe/layers/cudnn_pooling_layer.cpp [new file with mode: 0644]
src/caffe/layers/cudnn_pooling_layer.cu [new file with mode: 0644]
src/caffe/test/test_pooling_layer.cpp

index 4269163..4dd2e9d 100644 (file)
@@ -69,7 +69,8 @@ class ConvolutionLayer : public Layer<Dtype> {
  *        Fallback to ConvolutionLayer for CPU mode.
 */
 template <typename Dtype>
-class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
+class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype>
+{
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
       : ConvolutionLayer<Dtype>(param) {}
@@ -255,6 +256,33 @@ class PoolingLayer : public Layer<Dtype> {
   Blob<int> max_idx_;
 };
 
+#ifdef USE_CUDNN
+/**
+ * @brief cuDNN implementation of PoolingLayer.
+ *        Fallback to PoolingLayer for CPU mode.
+ */
+template <typename Dtype>
+class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
+ public:
+  explicit CuDNNPoolingLayer(const LayerParameter& param)
+      : PoolingLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual ~CuDNNPoolingLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
+
+  cudnnHandle_t             handle_;
+  cudnnTensor4dDescriptor_t bottom_desc_, top_desc_;
+  cudnnPoolingDescriptor_t  pooling_desc_;
+  cudnnPoolingMode_t        mode_;
+};
+#endif
+
 }  // namespace caffe
 
 #endif  // CAFFE_VISION_LAYERS_HPP_
index ef1b756..516fe87 100644 (file)
@@ -49,6 +49,10 @@ PoolingLayer<Dtype>* GetPoolingLayer(const string& name,
   }
   if (engine == PoolingParameter_Engine_CAFFE) {
     return new PoolingLayer<Dtype>(param);
+#ifdef USE_CUDNN
+  } else if (engine == PoolingParameter_Engine_CUDNN) {
+    return new CuDNNPoolingLayer<Dtype>(param);
+#endif
   } else {
     LOG(FATAL) << "Layer " << name << " has unknown engine.";
   }
diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp
new file mode 100644 (file)
index 0000000..c360b63
--- /dev/null
@@ -0,0 +1,40 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
+
+  // TODO(cudnn) check
+  cudnnStatus_t stat = cudnnCreate(&handle_);
+  CHECK_EQ(stat, CUDNN_STATUS_SUCCESS) << "Could not create a cuDNN handle.";
+  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
+      this->channels_, this->height_, this->width_);
+  cudnn::createTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
+      this->channels_, this->pooled_height_, this->pooled_width_);
+  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
+      this->layer_param_.pooling_param().pool(), &mode_,
+      this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_);
+}
+
+template <typename Dtype>
+CuDNNPoolingLayer<Dtype>::~CuDNNPoolingLayer() {
+  cudnnDestroyTensor4dDescriptor(bottom_desc_);
+  cudnnDestroyTensor4dDescriptor(top_desc_);
+  cudnnDestroyPoolingDescriptor(pooling_desc_);
+  cudnnDestroy(handle_);
+}
+
+INSTANTIATE_CLASS(CuDNNPoolingLayer);
+
+}   // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu
new file mode 100644 (file)
index 0000000..8f807ac
--- /dev/null
@@ -0,0 +1,55 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  // Fallback to Caffe for padded pooling, max top mask.
+  if ((this->pad_h_ > 0 || this->pad_w_ > 0) || (*top).size() > 1) {
+    LOG(WARNING) << "Falling back to standard Caffe for padded/masked pooling.";
+    return PoolingLayer<Dtype>::Forward_gpu(bottom, top);
+  }
+
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = (*top)[0]->mutable_gpu_data();
+  cudnnStatus_t stat = cudnnPoolingForward(handle_, pooling_desc_,
+      bottom_desc_, bottom_data, top_desc_, top_data);
+  CHECK_EQ(stat, CUDNN_STATUS_SUCCESS)
+      << "Error in cudnnPoolingForward.";
+}
+
+template <typename Dtype>
+void CuDNNPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+
+  // Fallback to Caffe for padded pooling, max top mask.
+  if ((this->pad_h_ > 0 || this->pad_w_ > 0) || top.size() > 1) {
+    LOG(WARNING) << "Falling back to standard Caffe for padded/masked pooling.";
+    return PoolingLayer<Dtype>::Backward_gpu(top, propagate_down, bottom);
+  }
+
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* bottom_data = (*bottom)[0]->gpu_data();
+  Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
+  cudnnStatus_t stat = cudnnPoolingBackward(handle_, pooling_desc_,
+      top_desc_, top_data, top_desc_, top_diff,
+      bottom_desc_, bottom_data, bottom_desc_, bottom_diff);
+  CHECK_EQ(stat, CUDNN_STATUS_SUCCESS) << "Error in cudnnPoolingBackward.";
+}
+
+INSTANTIATE_CLASS(CuDNNPoolingLayer);
+
+}  // namespace caffe
+#endif
index 5be01f7..ec23a68 100644 (file)
@@ -592,5 +592,587 @@ TYPED_TEST(PoolingLayerTest, TestGradientAvePadded) {
   }
 }
 
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNPoolingLayerTest : public ::testing::Test {
+ protected:
+  CuDNNPoolingLayerTest()
+      : blob_bottom_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()),
+        blob_top_mask_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    Caffe::set_random_seed(1701);
+    blob_bottom_->Reshape(2, 3, 6, 5);
+    // fill the values
+    FillerParameter filler_param;
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~CuDNNPoolingLayerTest() {
+    delete blob_bottom_;
+    delete blob_top_;
+    delete blob_top_mask_;
+  }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  Blob<Dtype>* const blob_top_mask_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+  // Test for 2x 2 square pooling layer
+  void TestForwardSquare() {
+    LayerParameter layer_param;
+    PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+    pooling_param->set_kernel_size(2);
+    pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+    const int num = 2;
+    const int channels = 2;
+    blob_bottom_->Reshape(num, channels, 3, 5);
+    // Input: 2x 2 channels of:
+    //     [1 2 5 2 3]
+    //     [9 4 1 4 8]
+    //     [1 2 5 2 3]
+    for (int i = 0; i < 15 * num * channels; i += 15) {
+      blob_bottom_->mutable_cpu_data()[i +  0] = 1;
+      blob_bottom_->mutable_cpu_data()[i +  1] = 2;
+      blob_bottom_->mutable_cpu_data()[i +  2] = 5;
+      blob_bottom_->mutable_cpu_data()[i +  3] = 2;
+      blob_bottom_->mutable_cpu_data()[i +  4] = 3;
+      blob_bottom_->mutable_cpu_data()[i +  5] = 9;
+      blob_bottom_->mutable_cpu_data()[i +  6] = 4;
+      blob_bottom_->mutable_cpu_data()[i +  7] = 1;
+      blob_bottom_->mutable_cpu_data()[i +  8] = 4;
+      blob_bottom_->mutable_cpu_data()[i +  9] = 8;
+      blob_bottom_->mutable_cpu_data()[i + 10] = 1;
+      blob_bottom_->mutable_cpu_data()[i + 11] = 2;
+      blob_bottom_->mutable_cpu_data()[i + 12] = 5;
+      blob_bottom_->mutable_cpu_data()[i + 13] = 2;
+      blob_bottom_->mutable_cpu_data()[i + 14] = 3;
+    }
+    CuDNNPoolingLayer<Dtype> layer(layer_param);
+    layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
+    EXPECT_EQ(blob_top_->num(), num);
+    EXPECT_EQ(blob_top_->channels(), channels);
+    EXPECT_EQ(blob_top_->height(), 2);
+    EXPECT_EQ(blob_top_->width(), 4);
+    if (blob_top_vec_.size() > 1) {
+      EXPECT_EQ(blob_top_mask_->num(), num);
+      EXPECT_EQ(blob_top_mask_->channels(), channels);
+      EXPECT_EQ(blob_top_mask_->height(), 2);
+      EXPECT_EQ(blob_top_mask_->width(), 4);
+    }
+    layer.Forward(blob_bottom_vec_, &blob_top_vec_);
+    // Expected output: 2x 2 channels of:
+    //     [9 5 5 8]
+    //     [9 5 5 8]
+    for (int i = 0; i < 8 * num * channels; i += 8) {
+      EXPECT_EQ(blob_top_->cpu_data()[i + 0], 9);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 1], 5);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 2], 5);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 3], 8);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 4], 9);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 5], 5);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 6], 5);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 7], 8);
+    }
+    if (blob_top_vec_.size() > 1) {
+      // Expected mask output: 2x 2 channels of:
+      //     [5  2  2 9]
+      //     [5 12 12 9]
+      for (int i = 0; i < 8 * num * channels; i += 8) {
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0],  5);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1],  2);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2],  2);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3],  9);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4],  5);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 12);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 12);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7],  9);
+      }
+    }
+  }
+  // Test for 3x 2 rectangular pooling layer with kernel_h > kernel_w
+  void TestForwardRectHigh() {
+    LayerParameter layer_param;
+    PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+    pooling_param->set_kernel_h(3);
+    pooling_param->set_kernel_w(2);
+    pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+    const int num = 2;
+    const int channels = 2;
+    blob_bottom_->Reshape(num, channels, 6, 6);
+    // Input: 2x 2 channels of:
+    // [35     1     6    26    19    24]
+    // [ 3    32     7    21    23    25]
+    // [31     9     2    22    27    20]
+    // [ 8    28    33    17    10    15]
+    // [30     5    34    12    14    16]
+    // [ 4    36    29    13    18    11]
+    // (this is generated by magic(6) in MATLAB)
+    for (int i = 0; i < 36 * num * channels; i += 36) {
+      blob_bottom_->mutable_cpu_data()[i +  0] = 35;
+      blob_bottom_->mutable_cpu_data()[i +  1] = 1;
+      blob_bottom_->mutable_cpu_data()[i +  2] = 6;
+      blob_bottom_->mutable_cpu_data()[i +  3] = 26;
+      blob_bottom_->mutable_cpu_data()[i +  4] = 19;
+      blob_bottom_->mutable_cpu_data()[i +  5] = 24;
+      blob_bottom_->mutable_cpu_data()[i +  6] = 3;
+      blob_bottom_->mutable_cpu_data()[i +  7] = 32;
+      blob_bottom_->mutable_cpu_data()[i +  8] = 7;
+      blob_bottom_->mutable_cpu_data()[i +  9] = 21;
+      blob_bottom_->mutable_cpu_data()[i + 10] = 23;
+      blob_bottom_->mutable_cpu_data()[i + 11] = 25;
+      blob_bottom_->mutable_cpu_data()[i + 12] = 31;
+      blob_bottom_->mutable_cpu_data()[i + 13] = 9;
+      blob_bottom_->mutable_cpu_data()[i + 14] = 2;
+      blob_bottom_->mutable_cpu_data()[i + 15] = 22;
+      blob_bottom_->mutable_cpu_data()[i + 16] = 27;
+      blob_bottom_->mutable_cpu_data()[i + 17] = 20;
+      blob_bottom_->mutable_cpu_data()[i + 18] = 8;
+      blob_bottom_->mutable_cpu_data()[i + 19] = 28;
+      blob_bottom_->mutable_cpu_data()[i + 20] = 33;
+      blob_bottom_->mutable_cpu_data()[i + 21] = 17;
+      blob_bottom_->mutable_cpu_data()[i + 22] = 10;
+      blob_bottom_->mutable_cpu_data()[i + 23] = 15;
+      blob_bottom_->mutable_cpu_data()[i + 24] = 30;
+      blob_bottom_->mutable_cpu_data()[i + 25] = 5;
+      blob_bottom_->mutable_cpu_data()[i + 26] = 34;
+      blob_bottom_->mutable_cpu_data()[i + 27] = 12;
+      blob_bottom_->mutable_cpu_data()[i + 28] = 14;
+      blob_bottom_->mutable_cpu_data()[i + 29] = 16;
+      blob_bottom_->mutable_cpu_data()[i + 30] = 4;
+      blob_bottom_->mutable_cpu_data()[i + 31] = 36;
+      blob_bottom_->mutable_cpu_data()[i + 32] = 29;
+      blob_bottom_->mutable_cpu_data()[i + 33] = 13;
+      blob_bottom_->mutable_cpu_data()[i + 34] = 18;
+      blob_bottom_->mutable_cpu_data()[i + 35] = 11;
+    }
+    CuDNNPoolingLayer<Dtype> layer(layer_param);
+    layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
+    EXPECT_EQ(blob_top_->num(), num);
+    EXPECT_EQ(blob_top_->channels(), channels);
+    EXPECT_EQ(blob_top_->height(), 4);
+    EXPECT_EQ(blob_top_->width(), 5);
+    if (blob_top_vec_.size() > 1) {
+      EXPECT_EQ(blob_top_mask_->num(), num);
+      EXPECT_EQ(blob_top_mask_->channels(), channels);
+      EXPECT_EQ(blob_top_mask_->height(), 4);
+      EXPECT_EQ(blob_top_mask_->width(), 5);
+    }
+    layer.Forward(blob_bottom_vec_, &blob_top_vec_);
+    // Expected output: 2x 2 channels of:
+    // [35    32    26    27    27]
+    // [32    33    33    27    27]
+    // [31    34    34    27    27]
+    // [36    36    34    18    18]
+    for (int i = 0; i < 20 * num * channels; i += 20) {
+      EXPECT_EQ(blob_top_->cpu_data()[i +  0], 35);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  1], 32);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  2], 26);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  3], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  4], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  5], 32);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  6], 33);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  7], 33);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  8], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  9], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 10], 31);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 11], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 13], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 14], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 15], 36);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 17], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 18], 18);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18);
+    }
+    if (blob_top_vec_.size() > 1) {
+      // [ 0     7     3    16    16]
+      // [ 7    20    20    16    16]
+      // [12    26    26    16    16]
+      // [31    31    26    34    34]
+      for (int i = 0; i < 20 * num * channels; i += 20) {
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  0],  0);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  1],  7);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  2],  3);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  3], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  4], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  5],  7);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  6], 20);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  7], 20);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  8], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  9], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 12);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 31);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 34);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34);
+      }
+    }
+  }
+  // Test for rectangular pooling layer with kernel_w > kernel_h
+  void TestForwardRectWide() {
+    LayerParameter layer_param;
+    PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+    pooling_param->set_kernel_h(2);
+    pooling_param->set_kernel_w(3);
+    pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+    const int num = 2;
+    const int channels = 2;
+    blob_bottom_->Reshape(num, channels, 6, 6);
+    // Input: 2x 2 channels of:
+    // [35     1     6    26    19    24]
+    // [ 3    32     7    21    23    25]
+    // [31     9     2    22    27    20]
+    // [ 8    28    33    17    10    15]
+    // [30     5    34    12    14    16]
+    // [ 4    36    29    13    18    11]
+    // (this is generated by magic(6) in MATLAB)
+    for (int i = 0; i < 36 * num * channels; i += 36) {
+      blob_bottom_->mutable_cpu_data()[i +  0] = 35;
+      blob_bottom_->mutable_cpu_data()[i +  1] = 1;
+      blob_bottom_->mutable_cpu_data()[i +  2] = 6;
+      blob_bottom_->mutable_cpu_data()[i +  3] = 26;
+      blob_bottom_->mutable_cpu_data()[i +  4] = 19;
+      blob_bottom_->mutable_cpu_data()[i +  5] = 24;
+      blob_bottom_->mutable_cpu_data()[i +  6] = 3;
+      blob_bottom_->mutable_cpu_data()[i +  7] = 32;
+      blob_bottom_->mutable_cpu_data()[i +  8] = 7;
+      blob_bottom_->mutable_cpu_data()[i +  9] = 21;
+      blob_bottom_->mutable_cpu_data()[i + 10] = 23;
+      blob_bottom_->mutable_cpu_data()[i + 11] = 25;
+      blob_bottom_->mutable_cpu_data()[i + 12] = 31;
+      blob_bottom_->mutable_cpu_data()[i + 13] = 9;
+      blob_bottom_->mutable_cpu_data()[i + 14] = 2;
+      blob_bottom_->mutable_cpu_data()[i + 15] = 22;
+      blob_bottom_->mutable_cpu_data()[i + 16] = 27;
+      blob_bottom_->mutable_cpu_data()[i + 17] = 20;
+      blob_bottom_->mutable_cpu_data()[i + 18] = 8;
+      blob_bottom_->mutable_cpu_data()[i + 19] = 28;
+      blob_bottom_->mutable_cpu_data()[i + 20] = 33;
+      blob_bottom_->mutable_cpu_data()[i + 21] = 17;
+      blob_bottom_->mutable_cpu_data()[i + 22] = 10;
+      blob_bottom_->mutable_cpu_data()[i + 23] = 15;
+      blob_bottom_->mutable_cpu_data()[i + 24] = 30;
+      blob_bottom_->mutable_cpu_data()[i + 25] = 5;
+      blob_bottom_->mutable_cpu_data()[i + 26] = 34;
+      blob_bottom_->mutable_cpu_data()[i + 27] = 12;
+      blob_bottom_->mutable_cpu_data()[i + 28] = 14;
+      blob_bottom_->mutable_cpu_data()[i + 29] = 16;
+      blob_bottom_->mutable_cpu_data()[i + 30] = 4;
+      blob_bottom_->mutable_cpu_data()[i + 31] = 36;
+      blob_bottom_->mutable_cpu_data()[i + 32] = 29;
+      blob_bottom_->mutable_cpu_data()[i + 33] = 13;
+      blob_bottom_->mutable_cpu_data()[i + 34] = 18;
+      blob_bottom_->mutable_cpu_data()[i + 35] = 11;
+    }
+    CuDNNPoolingLayer<Dtype> layer(layer_param);
+    layer.SetUp(blob_bottom_vec_, &blob_top_vec_);
+    EXPECT_EQ(blob_top_->num(), num);
+    EXPECT_EQ(blob_top_->channels(), channels);
+    EXPECT_EQ(blob_top_->height(), 5);
+    EXPECT_EQ(blob_top_->width(), 4);
+    if (blob_top_vec_.size() > 1) {
+      EXPECT_EQ(blob_top_mask_->num(), num);
+      EXPECT_EQ(blob_top_mask_->channels(), channels);
+      EXPECT_EQ(blob_top_mask_->height(), 5);
+      EXPECT_EQ(blob_top_mask_->width(), 4);
+    }
+    layer.Forward(blob_bottom_vec_, &blob_top_vec_);
+    // Expected output: 2x 2 channels of:
+    // [35    32    26    26]
+    // [32    32    27    27]
+    // [33    33    33    27]
+    // [34    34    34    17]
+    // [36    36    34    18]
+    for (int i = 0; i < 20 * num * channels; i += 20) {
+      EXPECT_EQ(blob_top_->cpu_data()[i +  0], 35);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  1], 32);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  2], 26);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  3], 26);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  4], 32);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  5], 32);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  6], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  7], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  8], 33);
+      EXPECT_EQ(blob_top_->cpu_data()[i +  9], 33);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 10], 33);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 11], 27);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 13], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 14], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 15], 17);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 17], 36);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 18], 34);
+      EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18);
+    }
+    if (blob_top_vec_.size() > 1) {
+      // [ 0     7     3     3]
+      // [ 7     7    16    16]
+      // [20    20    20    16]
+      // [26    26    26    21]
+      // [31    31    26    34]
+      for (int i = 0; i < 20 * num * channels; i += 20) {
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  0],  0);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  1],  7);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  2],  3);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  3],  3);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  4],  7);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  5],  7);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  6], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  7], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  8], 20);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i +  9], 20);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 20);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 16);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 21);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 31);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 26);
+        EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34);
+      }
+    }
+  }
+};
+
+TYPED_TEST_CASE(CuDNNPoolingLayerTest, TestDtypes);
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestSetupCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+  pooling_param->set_kernel_size(3);
+  pooling_param->set_stride(2);
+  CuDNNPoolingLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
+  EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
+  EXPECT_EQ(this->blob_top_->height(), 3);
+  EXPECT_EQ(this->blob_top_->width(), 2);
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestSetupPaddedCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+  pooling_param->set_kernel_size(3);
+  pooling_param->set_stride(2);
+  pooling_param->set_pad(1);
+  pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+  CuDNNPoolingLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
+  EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
+  EXPECT_EQ(this->blob_top_->height(), 4);
+  EXPECT_EQ(this->blob_top_->width(), 3);
+}
+
+/*
+TYPED_TEST(CuDNNPoolingLayerTest, PrintBackwardCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  layer_param.set_kernelsize(3);
+  layer_param.set_stride(2);
+  layer_param.set_pool(LayerParameter_PoolMethod_MAX);
+  CuDNNPoolingLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    cout << "bottom data " << i << " " << this->blob_bottom_->cpu_data()[i] << endl;
+  }
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    cout << "top data " << i << " " << this->blob_top_->cpu_data()[i] << endl;
+  }
+
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = i;
+  }
+  layer.Backward(this->blob_top_vec_, true, &(this->blob_bottom_vec_));
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    cout << "bottom diff " << i << " " << this->blob_bottom_->cpu_diff()[i] << endl;
+  }
+}
+*/
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  this->TestForwardSquare();
+  this->TestForwardRectHigh();
+  this->TestForwardRectWide();
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxTopMaskCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  this->blob_top_vec_.push_back(this->blob_top_mask_);
+  this->TestForwardSquare();
+  this->TestForwardRectHigh();
+  this->TestForwardRectWide();
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  for (int kernel_h = 3; kernel_h <= 4; kernel_h++) {
+    for (int kernel_w = 3; kernel_w <= 4; kernel_w++) {
+      LayerParameter layer_param;
+      PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+      pooling_param->set_kernel_h(kernel_h);
+      pooling_param->set_kernel_w(kernel_w);
+      pooling_param->set_stride(2);
+      pooling_param->set_pad(1);
+      pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+      CuDNNPoolingLayer<TypeParam> layer(layer_param);
+      GradientChecker<TypeParam> checker(1e-4, 1e-2);
+      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
+          &(this->blob_top_vec_));
+    }
+  }
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxPaddedCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+  pooling_param->set_kernel_size(3);
+  pooling_param->set_stride(2);
+  pooling_param->set_pad(2);
+  pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+  this->blob_bottom_->Reshape(1, 1, 3, 3);
+  // Input:
+  //     [ 1 2 4 ]
+  //     [ 2 3 2 ]
+  //     [ 4 2 1 ]
+  this->blob_bottom_->mutable_cpu_data()[0] = 1;
+  this->blob_bottom_->mutable_cpu_data()[1] = 2;
+  this->blob_bottom_->mutable_cpu_data()[2] = 4;
+  this->blob_bottom_->mutable_cpu_data()[3] = 2;
+  this->blob_bottom_->mutable_cpu_data()[4] = 3;
+  this->blob_bottom_->mutable_cpu_data()[5] = 2;
+  this->blob_bottom_->mutable_cpu_data()[6] = 4;
+  this->blob_bottom_->mutable_cpu_data()[7] = 2;
+  this->blob_bottom_->mutable_cpu_data()[8] = 1;
+  CuDNNPoolingLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(), 1);
+  EXPECT_EQ(this->blob_top_->channels(), 1);
+  EXPECT_EQ(this->blob_top_->height(), 3);
+  EXPECT_EQ(this->blob_top_->width(), 3);
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  TypeParam epsilon = 1e-8;
+  // Output:
+  //     [ 1 4 4 ]
+  //     [ 4 4 4 ]
+  //     [ 4 4 1 ]
+  EXPECT_NEAR(this->blob_top_->cpu_data()[0], 1, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[2], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[4], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[6], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[8], 1, epsilon);
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxTopMaskCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  for (int kernel_h = 3; kernel_h <= 4; kernel_h++) {
+    for (int kernel_w = 3; kernel_w <= 4; kernel_w++) {
+      LayerParameter layer_param;
+      PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+      pooling_param->set_kernel_h(kernel_h);
+      pooling_param->set_kernel_w(kernel_w);
+      pooling_param->set_stride(2);
+      pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+      this->blob_top_vec_.push_back(this->blob_top_mask_);
+      CuDNNPoolingLayer<TypeParam> layer(layer_param);
+      GradientChecker<TypeParam> checker(1e-4, 1e-2);
+      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
+          &(this->blob_top_vec_));
+      this->blob_top_vec_.pop_back();
+    }
+  }
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestForwardAveCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+  pooling_param->set_kernel_size(3);
+  pooling_param->set_stride(1);
+  pooling_param->set_pad(1);
+  pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+  this->blob_bottom_->Reshape(1, 1, 3, 3);
+  FillerParameter filler_param;
+  filler_param.set_value(TypeParam(2));
+  ConstantFiller<TypeParam> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  CuDNNPoolingLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(), 1);
+  EXPECT_EQ(this->blob_top_->channels(), 1);
+  EXPECT_EQ(this->blob_top_->height(), 3);
+  EXPECT_EQ(this->blob_top_->width(), 3);
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  TypeParam epsilon = 1e-5;
+  EXPECT_NEAR(this->blob_top_->cpu_data()[0], 8.0 / 9, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4.0 / 3, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[2], 8.0 / 9, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4.0 / 3, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[4], 2.0    , epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4.0 / 3, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[6], 8.0 / 9, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4.0 / 3, epsilon);
+  EXPECT_NEAR(this->blob_top_->cpu_data()[8], 8.0 / 9, epsilon);
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAveCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  for (int kernel_h = 3; kernel_h <= 4; kernel_h++) {
+    for (int kernel_w = 3; kernel_w <= 4; kernel_w++) {
+      LayerParameter layer_param;
+      PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+      pooling_param->set_kernel_h(kernel_h);
+      pooling_param->set_kernel_w(kernel_w);
+      pooling_param->set_stride(2);
+      pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+      CuDNNPoolingLayer<TypeParam> layer(layer_param);
+      GradientChecker<TypeParam> checker(1e-2, 1e-2);
+      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
+          &(this->blob_top_vec_));
+    }
+  }
+}
+
+TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAvePaddedCuDNN) {
+  Caffe::set_mode(Caffe::GPU);
+  for (int kernel_h = 3; kernel_h <= 4; kernel_h++) {
+    for (int kernel_w = 3; kernel_w <= 4; kernel_w++) {
+      LayerParameter layer_param;
+      PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+      pooling_param->set_kernel_h(kernel_h);
+      pooling_param->set_kernel_w(kernel_w);
+      pooling_param->set_stride(2);
+      pooling_param->set_pad(2);
+      pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+      CuDNNPoolingLayer<TypeParam> layer(layer_param);
+      GradientChecker<TypeParam> checker(1e-2, 1e-2);
+      checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
+          &(this->blob_top_vec_));
+    }
+  }
+}
+
+#endif
 
 }  // namespace caffe