From 842af5e8659e64c8cd1c1a7210bfda81fc1c6c09 Mon Sep 17 00:00:00 2001
From: Sergio
Date: Sun, 13 Apr 2014 20:06:38 -0700
Subject: [PATCH] Use mask_idx to compute backward Max Pooling

---
 src/caffe/layers/pooling_layer.cpp |  7 ----
 src/caffe/layers/pooling_layer.cu  | 65 +++++++++++++++++++++++---------------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 014f47b..e052f78 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -143,8 +143,6 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     return;
   }
   const Dtype* top_diff = top[0]->cpu_diff();
-  const Dtype* top_data = top[0]->cpu_data();
-  const Dtype* bottom_data = (*bottom)[0]->cpu_data();
   Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
   // Different pooling methods. We explicitly do the switch outside the for
   // loop to save time, although this results in more codes.
@@ -161,9 +159,6 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
           bottom_diff[mask[ph * pooled_width_ + pw]]+=top_diff[ph * pooled_width_ + pw];
         }
       }
-      // offset
-      bottom_data += (*bottom)[0]->offset(0, 1);
-      top_data += top[0]->offset(0, 1);
       bottom_diff += (*bottom)[0]->offset(0, 1);
       top_diff += top[0]->offset(0, 1);
       mask += top[0]->offset(0, 1);
@@ -194,8 +189,6 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         }
       }
       // offset
-      bottom_data += (*bottom)[0]->offset(0, 1);
-      top_data += top[0]->offset(0, 1);
       bottom_diff += (*bottom)[0]->offset(0, 1);
       top_diff += top[0]->offset(0, 1);
     }
diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu
index ff46109..b88b097 100644
--- a/src/caffe/layers/pooling_layer.cu
+++ b/src/caffe/layers/pooling_layer.cu
@@ -195,40 +195,55 @@ Dtype PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   return Dtype(0.);
 }
 
+// template <typename Dtype>
+// __global__ void MaxPoolBackward(const int nthreads, const Dtype* bottom_data,
+//     const Dtype* top_data, const Dtype* top_diff,
+//     const int num, const int channels, const int height,
+//     const int width, const int pooled_height, const int pooled_width,
+//     const int ksize, const int stride, Dtype* bottom_diff, int* mask) {
+//   int index = threadIdx.x + blockIdx.x * blockDim.x;
+//   if (index < nthreads) {
+//     // find out the local index
+//     // find out the local offset
+//     int w = index % width;
+//     int h = (index / width) % height;
+//     int c = (index / width / height) % channels;
+//     int n = index / width / height / channels;
+//     int phstart = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+//     int phend = min(h / stride + 1, pooled_height);
+//     int pwstart = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+//     int pwend = min(w / stride + 1, pooled_width);
+//     Dtype gradient = 0;
+//     Dtype bottom_datum =
+//         bottom_data[((n * channels + c) * height + h) * width + w];
+//     top_data += (n * channels + c) * pooled_height * pooled_width;
+//     top_diff += (n * channels + c) * pooled_height * pooled_width;
+//     //bottom_diff[index] += top_diff[mask[index]];
+//     for (int ph = phstart; ph < phend; ++ph) {
+//       for (int pw = pwstart; pw < pwend; ++pw) {
+//         gradient += top_diff[ph * pooled_width + pw] *
+//             (bottom_datum == top_data[ph * pooled_width + pw]);
+//       }
+//     }
+//     bottom_diff[index] = gradient;
+//   }  // (if index < nthreads)
+// }
+
 template <typename Dtype>
-__global__ void MaxPoolBackward(const int nthreads, const Dtype* bottom_data,
-    const Dtype* top_data, const Dtype* top_diff,
+__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_size, const int stride, Dtype* bottom_diff, int* mask) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
     // find out the local offset
-    int w = index % width;
-    int h = (index / width) % height;
-    int c = (index / width / height) % channels;
-    int n = index / width / height / channels;
-    int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
-    int phend = min(h / stride + 1, pooled_height);
-    int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
-    int pwend = min(w / stride + 1, pooled_width);
-    Dtype gradient = 0;
-    Dtype bottom_datum =
-        bottom_data[((n * channels + c) * height + h) * width + w];
-    top_data += (n * channels + c) * pooled_height * pooled_width;
-    top_diff += (n * channels + c) * pooled_height * pooled_width;
-    //bottom_diff[index] += top_diff[mask[index]];
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        gradient += top_diff[ph * pooled_width + pw] *
-            (bottom_datum == top_data[ph * pooled_width + pw]);
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    bottom_diff += (n * channels + c) * height * width;
+    bottom_diff[mask[index]] += top_diff[index];
+  }
 }
 
-
 template <typename Dtype>
 __global__ void AvePoolBackward(const int nthreads, const Dtype* top_diff,
     const int num, const int channels, const int height,
-- 
2.7.4
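
Note on the change, for readers of this patch: the forward pass already records, for every pooled output, the flat index of the bottom element that produced the max (the mask). The backward pass therefore no longer has to rescan each pooling window comparing bottom_data against top_data; it simply scatters each top gradient to the recorded index, which is what the new one-line body of MaxPoolBackward does per (n, c) plane. The standalone CPU sketch below illustrates that scatter; the function name and plain-array interface are hypothetical, for illustration only, and are not part of Caffe.

  // Hypothetical illustration (not Caffe code): scatter pooled gradients back
  // to the bottom locations recorded in `mask` during the forward pass.
  // mask[i] holds the flat index, within one height*width plane, of the
  // element that won the max for pooled output i.
  void max_pool_backward_plane(const float* top_diff, const int* mask,
                               const int pooled_height, const int pooled_width,
                               float* bottom_diff /* zero-initialized, height*width */) {
    for (int i = 0; i < pooled_height * pooled_width; ++i) {
      // Each pooled output sends its gradient only to its argmax input;
      // overlapping windows may pick the same input, hence the accumulation.
      bottom_diff[mask[i]] += top_diff[i];
    }
  }

Accumulating with += rather than assigning matters when stride is smaller than the kernel size, since overlapping windows can select the same bottom element; the CPU path in pooling_layer.cpp relies on the same accumulation.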