++head;
}
// subtract only
- while (head < size + post_pad) {
+ while (head < channels + post_pad) {
accum_scale -= in[(head - size) * step] * in[(head - size) * step];
scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
++head;
}
}
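+// The loops above leave scale[c] = 1 + accum_scale * alpha_over_size for
+// every channel c of this (n, h, w) column, where accum_scale is the running
+// sum of squared inputs over the local channel window around c.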
+
+// TODO: check if it would be faster to just put it into the previous kernel.
template <typename Dtype>
__global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
    const Dtype* scale, const Dtype negative_beta, Dtype* out) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < nthreads) {
    out[index] = in[index] * pow(scale[index], negative_beta);
  }
}
+
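+// Gradient of LRN, assuming the standard across-channel definition used by
+// the forward kernels above:
+//   top_i   = bottom_i * scale_i^(-beta)
+//   scale_i = 1 + alpha_over_size * sum_{j in window(i)} bottom_j^2
+// Differentiating gives
+//   bottom_diff_i = top_diff_i * scale_i^(-beta)
+//       - (2 * alpha * beta / size) * bottom_i
+//         * sum_{j : i in window(j)} top_diff_j * top_j / scale_j.
+// The kernel below accumulates that sum with the same sliding-window trick as
+// the forward pass (accum_ratio), and cache_ratio is passed in as
+// 2 * alpha * beta / size.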
+template <typename Dtype>
+__global__ void LRNComputeDiff(const int nthreads, const Dtype* bottom_data,
+ const Dtype* top_data, const Dtype* scale, const Dtype* top_diff,
+ const int num, const int channels, const int height,
+ const int width, const int size, const Dtype negative_beta,
+ const Dtype cache_ratio,
+ Dtype* bottom_diff) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index < nthreads) {
+ // find out the local offset
+ int w = index % width;
+ int h = (index / width) % height;
+ int n = index / width / height;
+ int offset = (n * channels * height + h) * width + w;
+ int step = height * width;
+ bottom_data += offset;
+ top_data += offset;
+ scale += offset;
+ top_diff += offset;
+ bottom_diff += offset;
+ int head = 0;
+ int pre_pad = size - (size + 1) / 2;
+ int post_pad = size - pre_pad - 1;
+ Dtype accum_ratio = 0;
+ // accumulate values
+ while (head < post_pad) {
+ accum_ratio += top_diff[head * step] * top_data[head * step] /
+ scale[head * step];
+ ++head;
+ }
+ // until we reach size, nothing needs to be subtracted
+ while (head < size) {
+ accum_ratio += top_diff[head * step] * top_data[head * step] /
+ scale[head * step];
+ bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+ * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+ bottom_data[(head - post_pad) * step] * accum_ratio;
+ ++head;
+ }
+ // both add and subtract
+ while (head < channels) {
+ accum_ratio += top_diff[head * step] * top_data[head * step] /
+ scale[head * step];
+ accum_ratio -= top_diff[(head - size) * step] *
+ top_data[(head - size) * step] / scale[(head - size) * step];
+ bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+ * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+ bottom_data[(head - post_pad) * step] * accum_ratio;
+ ++head;
+ }
+ // subtract only
+ while (head < channels + post_pad) {
+ accum_ratio -= top_diff[(head - size) * step] *
+ top_data[(head - size) * step] / scale[(head - size) * step];
+ bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+ * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+ bottom_data[(head - post_pad) * step] * accum_ratio;
+ ++head;
+ }
+ }
+}
+
template <typename Dtype>
Dtype LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
- NOT_IMPLEMENTED;
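+ // One CUDA thread per (n, h, w) position; each thread sweeps the channel
+ // axis with the sliding window implemented in LRNComputeDiff.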
+ int n_threads = num_ * height_ * width_;
+ LRNComputeDiff<<<CAFFEINE_GET_BLOCKS(n_threads), CAFFEINE_CUDA_NUM_THREADS>>>(
+ n_threads, (*bottom)[0]->gpu_data(), top[0]->gpu_data(),
+ scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+ size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+ (*bottom)[0]->mutable_gpu_diff());
return Dtype(0.);
}
+
INSTANTIATE_CLASS(LRNLayer);
} // namespace caffeine
: blob_bottom_(new Blob<Dtype>()),
blob_top_(new Blob<Dtype>()) {};
virtual void SetUp() {
- blob_bottom_->Reshape(2,7,3,3);
+ Caffeine::set_random_seed(1701);
+ blob_bottom_->Reshape(2, 7, 3, 3);
// fill the values
FillerParameter filler_param;
GaussianFiller<Dtype> filler(filler_param);
EXPECT_EQ(this->blob_top_->width(), 3);
}
-TYPED_TEST(LRNLayerTest, TestCPU) {
+TYPED_TEST(LRNLayerTest, TestCPUForward) {
LayerParameter layer_param;
LRNLayer<TypeParam> layer(layer_param);
Caffeine::set_mode(Caffeine::CPU);
EXPECT_LE(this->blob_top_->cpu_data()[i],
top_reference.cpu_data()[i] + 1e-5);
}
+}
+TYPED_TEST(LRNLayerTest, TestGPUForward) {
+ LayerParameter layer_param;
+ LRNLayer<TypeParam> layer(layer_param);
Caffeine::set_mode(Caffeine::GPU);
+ layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+ Blob<TypeParam> top_reference;
+ this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+ &top_reference);
for (int i = 0; i < this->blob_bottom_->count(); ++i) {
EXPECT_GE(this->blob_top_->cpu_data()[i],
top_reference.cpu_data()[i] - 1e-5);
TYPED_TEST(LRNLayerTest, TestCPUGradient) {
LayerParameter layer_param;
LRNLayer<TypeParam> layer(layer_param);
+ GradientChecker<TypeParam> checker(1e-2, 1e-2);
Caffeine::set_mode(Caffeine::CPU);
- // when testing the GPU gradient, let's do a small shape.
- this->blob_bottom_->Reshape(2, 7, 3, 3);
- FillerParameter filler_param;
- GaussianFiller<TypeParam> filler(filler_param);
- filler.Fill(this->blob_bottom_);
+ layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+ layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
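+ // Seed the top diff with ones and run Backward once by hand; the exhaustive
+ // gradient check below recomputes its own gradients.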
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ this->blob_top_->mutable_cpu_diff()[i] = 1.;
+ }
+ layer.Backward(this->blob_top_vec_, true, &(this->blob_bottom_vec_));
+ checker.CheckGradientExhaustive(layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(LRNLayerTest, TestGPUGradient) {
+ LayerParameter layer_param;
+ LRNLayer<TypeParam> layer(layer_param);
GradientChecker<TypeParam> checker(1e-2, 1e-2);
+ Caffeine::set_mode(Caffeine::GPU);
+ layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+ layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
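+ // Same manual Backward pass as the CPU test, now exercising the GPU kernel.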
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ this->blob_top_->mutable_cpu_diff()[i] = 1.;
+ }
+ layer.Backward(this->blob_top_vec_, true, &(this->blob_bottom_vec_));
checker.CheckGradientExhaustive(layer, this->blob_bottom_vec_, this->blob_top_vec_);
}