lrn backward: implement the LRN backward pass on GPU and extend the forward/gradient tests
author    Yangqing Jia <jiayq84@gmail.com>
          Sun, 22 Sep 2013 01:49:39 +0000 (18:49 -0700)
committer Yangqing Jia <jiayq84@gmail.com>
          Sun, 22 Sep 2013 01:49:39 +0000 (18:49 -0700)
src/caffeine/layers/lrn_layer.cu
src/caffeine/test/test_lrn_layer.cpp

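For reference, a sketch of the gradient this commit implements, assuming the across-channel LRN formulation already used by the forward pass (window of n = size channels, parameters alpha and beta):

\[
s_i = 1 + \frac{\alpha}{n} \sum_{j \in N(i)} x_j^2,
\qquad
y_i = x_i \, s_i^{-\beta},
\]
\[
\frac{\partial E}{\partial x_i}
  = \frac{\partial E}{\partial y_i} \, s_i^{-\beta}
  \;-\; \frac{2\alpha\beta}{n} \, x_i \sum_{j \,:\, i \in N(j)} \frac{\partial E}{\partial y_j} \, \frac{y_j}{s_j}.
\]

In the kernel added below, the second sum is maintained as accum_ratio over a sliding window of top_diff * top_data / scale, and the constant 2 * alpha * beta / size is passed in as cache_ratio.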
diff --git a/src/caffeine/layers/lrn_layer.cu b/src/caffeine/layers/lrn_layer.cu
index 5eb7efa..2dc5143 100644
--- a/src/caffeine/layers/lrn_layer.cu
+++ b/src/caffeine/layers/lrn_layer.cu
@@ -43,7 +43,7 @@ __global__ void LRNFillScale(const int nthreads, const Dtype* in,
       ++head;
     }
     // subtract only
-    while (head < size + post_pad) {
+    while (head < channels + post_pad) {
       accum_scale -= in[(head - size) * step] * in[(head - size) * step];
       scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
       ++head;
@@ -51,6 +51,8 @@ __global__ void LRNFillScale(const int nthreads, const Dtype* in,
   }
 }
 
+
+// TODO: check if it would be faster to just put it into the previous kernel.
 template <typename Dtype>
 __global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
     const Dtype* scale, const Dtype negative_beta, Dtype* out) {
@@ -80,13 +82,82 @@ void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   CUDA_POST_KERNEL_CHECK;
 }
 
+
+template <typename Dtype>
+__global__ void LRNComputeDiff(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_data, const Dtype* scale, const Dtype* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio,
+    Dtype* bottom_diff) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < nthreads) {
+    // find out the local offset
+    int w = index % width;
+    int h = (index / width) % height;
+    int n = index / width / height;
+    int offset = (n * channels * height + h) * width + w;
+    int step = height * width;
+    bottom_data += offset;
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    int head = 0;
+    int pre_pad = size - (size + 1) / 2;
+    int post_pad = size - pre_pad - 1;
+    Dtype accum_ratio = 0;
+    // accumulate values 
+    while (head < post_pad) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      ++head;
+    }
+    // until we reach size, nothing needs to be subtracted
+    while (head < size) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+  }
+}
+
 template <typename Dtype>
 Dtype LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
-  NOT_IMPLEMENTED;
+  int n_threads = num_ * height_ * width_;
+  LRNComputeDiff<<<CAFFEINE_GET_BLOCKS(n_threads), CAFFEINE_CUDA_NUM_THREADS>>>(
+      n_threads, (*bottom)[0]->gpu_data(), top[0]->gpu_data(),
+      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      (*bottom)[0]->mutable_gpu_diff());
   return Dtype(0.);
 }
 
+
 INSTANTIATE_CLASS(LRNLayer);
 
 }  // namespace caffeine
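
For intuition, here is a rough CPU rendering of what LRNComputeDiff above does for a single (n, h, w) column; the plain-array interface and the function name are made up for illustration and are not part of the commit:

#include <algorithm>
#include <cmath>

// Illustrative only: bottom/top/scale/top_diff hold the per-channel values at
// one (n, h, w) position, laid out contiguously instead of strided by step.
template <typename Dtype>
void LRNColumnBackwardSketch(const Dtype* bottom, const Dtype* top,
    const Dtype* scale, const Dtype* top_diff, int channels, int size,
    Dtype alpha, Dtype beta, Dtype* bottom_diff) {
  const Dtype cache_ratio = Dtype(2. * alpha * beta / size);
  const int pre_pad = size - (size + 1) / 2;   // same window split as the kernel
  const int post_pad = size - pre_pad - 1;
  for (int c = 0; c < channels; ++c) {
    // Sum top_diff * top / scale over every output channel whose LRN window
    // contains c; the window clamps at the channel boundaries.
    Dtype accum_ratio = 0;
    const int lo = std::max(0, c - pre_pad);
    const int hi = std::min(channels - 1, c + post_pad);
    for (int j = lo; j <= hi; ++j) {
      accum_ratio += top_diff[j] * top[j] / scale[j];
    }
    bottom_diff[c] = top_diff[c] * std::pow(scale[c], -beta)
        - cache_ratio * bottom[c] * accum_ratio;
  }
}

The CUDA kernel assigns one thread to each such column (num_ * height_ * width_ threads in total), walks the channel axis with a stride of height * width, and updates accum_ratio incrementally instead of re-summing the window for every channel.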
diff --git a/src/caffeine/test/test_lrn_layer.cpp b/src/caffeine/test/test_lrn_layer.cpp
index 54daecb..f8cfb21 100644
--- a/src/caffeine/test/test_lrn_layer.cpp
+++ b/src/caffeine/test/test_lrn_layer.cpp
@@ -26,7 +26,8 @@ class LRNLayerTest : public ::testing::Test {
       : blob_bottom_(new Blob<Dtype>()),
         blob_top_(new Blob<Dtype>()) {};
   virtual void SetUp() {
-    blob_bottom_->Reshape(2,7,3,3);
+    Caffeine::set_random_seed(1701);
+    blob_bottom_->Reshape(2, 7, 3, 3);
     // fill the values
     FillerParameter filler_param;
     GaussianFiller<Dtype> filler(filler_param);
@@ -87,7 +88,7 @@ TYPED_TEST(LRNLayerTest, TestSetup) {
   EXPECT_EQ(this->blob_top_->width(), 3);
 }
 
-TYPED_TEST(LRNLayerTest, TestCPU) {
+TYPED_TEST(LRNLayerTest, TestCPUForward) {
   LayerParameter layer_param;
   LRNLayer<TypeParam> layer(layer_param);
   Caffeine::set_mode(Caffeine::CPU);
@@ -102,9 +103,17 @@ TYPED_TEST(LRNLayerTest, TestCPU) {
     EXPECT_LE(this->blob_top_->cpu_data()[i],
         top_reference.cpu_data()[i] + 1e-5);
   }
+}
 
+TYPED_TEST(LRNLayerTest, TestGPUForward) {
+  LayerParameter layer_param;
+  LRNLayer<TypeParam> layer(layer_param);
   Caffeine::set_mode(Caffeine::GPU);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
   layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  Blob<TypeParam> top_reference;
+  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+      &top_reference);
   for (int i = 0; i < this->blob_bottom_->count(); ++i) {
     EXPECT_GE(this->blob_top_->cpu_data()[i],
         top_reference.cpu_data()[i] - 1e-5);
@@ -116,13 +125,34 @@ TYPED_TEST(LRNLayerTest, TestCPU) {
 TYPED_TEST(LRNLayerTest, TestCPUGradient) {
   LayerParameter layer_param;
   LRNLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-2);
   Caffeine::set_mode(Caffeine::CPU);
-  // when testing the GPU gradient, let's do a small shape.
-  this->blob_bottom_->Reshape(2, 7, 3, 3);
-  FillerParameter filler_param;
-  GaussianFiller<TypeParam> filler(filler_param);
-  filler.Fill(this->blob_bottom_);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = 1.;
+  }
+  layer.Backward(this->blob_top_vec_, true, &(this->blob_bottom_vec_));
+  //for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+  //  std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i] << std::endl;
+  //}
+  checker.CheckGradientExhaustive(layer, this->blob_bottom_vec_, this->blob_top_vec_);
+} 
+
+TYPED_TEST(LRNLayerTest, TestGPUGradient) {
+  LayerParameter layer_param;
+  LRNLayer<TypeParam> layer(layer_param);
   GradientChecker<TypeParam> checker(1e-2, 1e-2);
+  Caffeine::set_mode(Caffeine::GPU);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = 1.;
+  }
+  layer.Backward(this->blob_top_vec_, true, &(this->blob_bottom_vec_));
+  //for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+  //  std::cout << "GPU diff " << this->blob_bottom_->cpu_diff()[i] << std::endl;
+  //}
   checker.CheckGradientExhaustive(layer, this->blob_bottom_vec_, this->blob_top_vec_);
 }
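
The two gradient tests lean on GradientChecker, whose two 1e-2 arguments above set the finite-difference step size and the error tolerance. Roughly, CheckGradientExhaustive perturbs each bottom value and compares a numeric gradient against the analytic one produced by Backward; a hypothetical single-element version of that comparison, not the checker's real interface, looks like this:

#include <algorithm>
#include <cmath>

// Hypothetical sketch: objective(x) stands for "set one bottom element to x,
// run Forward, and reduce the top blob to a scalar"; analytic_diff is the
// corresponding bottom_diff value written by Backward.
template <typename Dtype, typename Objective>
bool GradientMatchesSketch(Objective objective, Dtype x, Dtype analytic_diff,
    Dtype stepsize = Dtype(1e-2), Dtype threshold = Dtype(1e-2)) {
  // Central difference: (f(x + h) - f(x - h)) / (2h).
  const Dtype numeric_diff =
      (objective(x + stepsize) - objective(x - stepsize)) / (2 * stepsize);
  // Compare relative to the larger of the two gradients (or 1 if both are tiny).
  const Dtype scale = std::max(Dtype(1),
      std::max(std::abs(numeric_diff), std::abs(analytic_diff)));
  return std::abs(numeric_diff - analytic_diff) <= threshold * scale;
}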