#include <vector>
#include "caffe/common_layers.hpp"
-#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
- template <typename Dtype>
- void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top) {
- top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
- bottom[0]->height(), bottom[0]->width());
-
- x_norm_.Reshape(bottom[0]->num(), bottom[0]->channels(),
- bottom[0]->height(), bottom[0]->width());
-
- // Figure out the dimensions
- N_ = bottom[0]->num();
- C_ = bottom[0]->channels();
- H_ = bottom[0]->height();
- W_ = bottom[0]->width();
- // mean
- spatial_mean_.Reshape(N_, C_, 1, 1);
- batch_mean_.Reshape(1, C_, 1, 1);
- // variance
- spatial_variance_.Reshape(N_, C_, 1, 1);
- batch_variance_.Reshape(1, C_, 1, 1);
- // buffer blod
- buffer_blob_.Reshape(N_, C_, H_, W_);
-
- // fill spatial multiplier
- spatial_sum_multiplier_.Reshape(1, 1, H_, W_);
- Dtype* spatial_multipl_data = spatial_sum_multiplier_.mutable_cpu_data();
- caffe_set(spatial_sum_multiplier_.count(), Dtype(1),
- spatial_multipl_data);
- caffe_set(spatial_sum_multiplier_.count(), Dtype(0),
- spatial_sum_multiplier_.mutable_cpu_diff());
- // fill batch multiplier
- batch_sum_multiplier_.Reshape(N_, 1, 1, 1);
- Dtype* batch_multiplier_data = batch_sum_multiplier_.mutable_cpu_data();
- caffe_set(batch_sum_multiplier_.count(), Dtype(1),
- batch_multiplier_data);
- caffe_set(batch_sum_multiplier_.count(), Dtype(0),
- batch_sum_multiplier_.mutable_cpu_diff());
- this->param_propagate_down_.resize(this->blobs_.size(), true);
- }
- template <typename Dtype>
- void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+template <typename Dtype>
+void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
- "allow in-place computation.";
-
- top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
- bottom[0]->height(), bottom[0]->width());
-
- x_norm_.Reshape(bottom[0]->num(), bottom[0]->channels(),
- bottom[0]->height(), bottom[0]->width());
- // Figure out the dimensions
- N_ = bottom[0]->num();
- C_ = bottom[0]->channels();
- H_ = bottom[0]->height();
- W_ = bottom[0]->width();
- var_eps_ = 1e-9;
-
- // mean
- spatial_mean_.Reshape(N_, C_, 1, 1);
- batch_mean_.Reshape(1, C_, 1, 1);
- // variance
- spatial_variance_.Reshape(N_, C_, 1, 1);
- batch_variance_.Reshape(1, C_, 1, 1);
- // buffer blod
- buffer_blob_.Reshape(N_, C_, H_, W_);
-
- // fill spatial multiplier
- spatial_sum_multiplier_.Reshape(1, 1, H_, W_);
- Dtype* spatial_multipl_data = spatial_sum_multiplier_.mutable_cpu_data();
- caffe_set(spatial_sum_multiplier_.count(), Dtype(1),
- spatial_multipl_data);
- caffe_set(spatial_sum_multiplier_.count(), Dtype(0),
- spatial_sum_multiplier_.mutable_cpu_diff());
-
- // fill batch multiplier
- batch_sum_multiplier_.Reshape(N_, 1, 1, 1);
- Dtype* batch_multiplier_data = batch_sum_multiplier_.mutable_cpu_data();
- caffe_set(batch_sum_multiplier_.count(), Dtype(1),
- batch_multiplier_data);
- caffe_set(batch_sum_multiplier_.count(), Dtype(0),
- batch_sum_multiplier_.mutable_cpu_diff());
-
- // Check if we need to set up the weights
- if (this->blobs_.size() > 0) {
- LOG(INFO) << "Skipping parameter initialization";
- } else {
- this->blobs_.resize(2);
-
- // fill scale with scale_filler
- this->blobs_[0].reset(new Blob<Dtype>(1, C_, 1, 1));
- caffe_set(this->blobs_[0]->count(), Dtype(1),
- this->blobs_[0]->mutable_cpu_data());
-
- // fill shift with shift_filler
- this->blobs_[1].reset(new Blob<Dtype>(1, C_, 1, 1));
- caffe_set(this->blobs_[1]->count(), Dtype(0),
- this->blobs_[1]->mutable_cpu_data());
- } // parameter initialization
- this->param_propagate_down_.resize(this->blobs_.size(), true);
+ BatchNormParameter param = this->layer_param_.batch_norm_param();
+ moving_average_fraction_ = param.moving_average_fraction();
+ use_global_stats_ = this->phase_ == TEST;
+ if (param.has_use_global_stats())
+ use_global_stats_ = param.use_global_stats();
+ if (bottom[0]->num_axes() == 1)
+ channels_ = 1;
+ else
+ channels_ = bottom[0]->shape(1);
+ eps_ = param.eps();
+ if (this->blobs_.size() > 0) {
+ LOG(INFO) << "Skipping parameter initialization";
+ } else {
+ this->blobs_.resize(3);
+ vector<int> sz;
+ sz.push_back(channels_);
+ this->blobs_[0].reset(new Blob<Dtype>(sz));
+ this->blobs_[1].reset(new Blob<Dtype>(sz));
+ sz[0]=1;
+ this->blobs_[2].reset(new Blob<Dtype>(sz));
+ for (int i = 0; i < 3; ++i) {
+ caffe_set(this->blobs_[i]->count(), Dtype(0),
+ this->blobs_[i]->mutable_cpu_data());
+ }
}
+}
- template <typename Dtype>
- void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+template <typename Dtype>
+void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- const Dtype* bottom_data = bottom[0]->cpu_data();
- Dtype* top_data = top[0]->mutable_cpu_data();
- const Dtype* const_top_data = top[0]->cpu_data();
-
- const Dtype* scale_data = this->blobs_[0]->cpu_data();
- const Dtype* shift_data = this->blobs_[1]->cpu_data();
-
- // put the squares of bottom into buffer_blob_
- caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
- buffer_blob_.mutable_cpu_data());
+ if (bottom[0]->num_axes() >= 1)
+ CHECK_EQ(bottom[0]->shape(1), channels_);
+ top[0]->ReshapeLike(*bottom[0]);
+
+ vector<int> sz;
+ sz.push_back(channels_);
+ mean_.Reshape(sz);
+ variance_.Reshape(sz);
+ temp_.ReshapeLike(*bottom[0]);
+ x_norm_.ReshapeLike(*bottom[0]);
+ sz[0]=bottom[0]->shape(0);
+ batch_sum_multiplier_.Reshape(sz);
+
+ int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
+ if (spatial_sum_multiplier_.num_axes() == 0 ||
+ spatial_sum_multiplier_.shape(0) != spatial_dim) {
+ sz[0] = spatial_dim;
+ spatial_sum_multiplier_.Reshape(sz);
+ Dtype* multiplier_data = spatial_sum_multiplier_.mutable_cpu_data();
+ caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
+ }
+ int numbychans = channels_*bottom[0]->shape(0);
+ if (num_by_chans_.num_axes() == 0 ||
+ num_by_chans_.shape(0) != numbychans) {
+ sz[0] = numbychans;
+ num_by_chans_.Reshape(sz);
+ caffe_set(batch_sum_multiplier_.count(), Dtype(1),
+ batch_sum_multiplier_.mutable_cpu_data());
+ }
+}
+
+template <typename Dtype>
+void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ int num = bottom[0]->shape(0);
+ int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
+
+ // elementwise square
+ caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
+ temp_.mutable_cpu_data());
+
+ if (use_global_stats_) {
+ // use the stored mean/variance estimates. TODO(cdoersch): allow an option
+ // to use an unbiased variance estimate, like the paper does.
+ const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
+ caffe_cpu_scale(variance_.count(), scale_factor,
+ this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
+ caffe_cpu_scale(variance_.count(), scale_factor,
+ this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
+ } else {
// computes variance using var(X) = E(X^2) - (EX)^2
- // EX across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1. / (H_ * W_)), bottom_data,
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
- // EX across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
- spatial_mean_.cpu_data(),
- batch_sum_multiplier_.cpu_data(), Dtype(0),
- batch_mean_.mutable_cpu_data());
-
- // E(X^2) across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1. / (H_ * W_)), buffer_blob_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- spatial_variance_.mutable_cpu_data());
- // E(X^2) across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
- spatial_variance_.cpu_data(),
- batch_sum_multiplier_.cpu_data(), Dtype(0),
- batch_variance_.mutable_cpu_data());
-
- caffe_powx(batch_mean_.count(), batch_mean_.cpu_data(), Dtype(2),
- buffer_blob_.mutable_cpu_data()); // (EX)^2
- caffe_sub(batch_mean_.count(), batch_variance_.cpu_data(),
- buffer_blob_.cpu_data(),
- batch_variance_.mutable_cpu_data()); // variance
-
- // do mean and variance normalization
- // subtract mean
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
- C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(),
- batch_mean_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(-1),
- spatial_mean_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
-
- caffe_add(buffer_blob_.count(), bottom_data,
- buffer_blob_.cpu_data(), top_data);
-
- // normalize variance
- caffe_add_scalar(batch_variance_.count(), var_eps_,
- batch_variance_.mutable_cpu_data());
- caffe_powx(batch_variance_.count(),
- batch_variance_.cpu_data(), Dtype(0.5),
- batch_variance_.mutable_cpu_data());
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
- C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(),
- batch_variance_.cpu_data(), Dtype(0),
- spatial_variance_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_ * C_, H_ * W_, 1, Dtype(1),
- spatial_variance_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
-
- caffe_div(buffer_blob_.count(), const_top_data,
- buffer_blob_.cpu_data(), top_data);
-
- // Saving x_norm
- caffe_copy(buffer_blob_.count(), const_top_data,
- x_norm_.mutable_cpu_data());
- // scale
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
- spatial_variance_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
- caffe_mul(buffer_blob_.count(), top_data,
- buffer_blob_.cpu_data(), top_data);
-
- // shift
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0),
- spatial_mean_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_ * C_, H_ * W_, 1, Dtype(1),
- spatial_mean_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
- caffe_add(buffer_blob_.count(), const_top_data,
- buffer_blob_.cpu_data(), top_data);
+ caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
+ 1. / (num * spatial_dim), bottom_data,
+ spatial_sum_multiplier_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
+ mean_.mutable_cpu_data());
+ caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
+ 1. / (num * spatial_dim), temp_.cpu_data(),
+ spatial_sum_multiplier_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
+ variance_.mutable_cpu_data());
+ this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
+ this->blobs_[2]->mutable_cpu_data()[0] += 1;
+ caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
+ moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
+ Dtype m = Dtype(bottom[0]->count()/channels_);
+ caffe_cpu_axpby(variance_.count(), m/(m-1), variance_.cpu_data(),
+ moving_average_fraction_, this->blobs_[1]->mutable_cpu_data());
}
+ // elementwise square of mean
+ caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
+ temp_.mutable_cpu_data());
- template <typename Dtype>
- void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down,
- const vector<Blob<Dtype>*>& bottom) {
- const Dtype* top_diff = top[0]->cpu_diff();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-
- Dtype* scale_diff = this->blobs_[0]->mutable_cpu_diff();
- Dtype* shift_diff = this->blobs_[1]->mutable_cpu_diff();
- const Dtype* scale_data = this->blobs_[0]->cpu_data();
-
-// Propagate layer to parameters
- // gradient w.r.t. scale
- caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(),
- top_diff, buffer_blob_.mutable_cpu_data());
- // EX across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
- H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- spatial_variance_.mutable_cpu_diff());
- // EX across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_variance_.cpu_diff(),
- batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff);
-
- // gradient w.r.t. shift
- // EX across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
- H_ * W_, Dtype(1), top_diff,
- spatial_sum_multiplier_.cpu_data(),
- Dtype(0), spatial_mean_.mutable_cpu_diff());
- // EX across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_,
- Dtype(1), spatial_mean_.cpu_diff(),
- batch_sum_multiplier_.cpu_data(),
- Dtype(0), shift_diff);
+ caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
+ variance_.mutable_cpu_data()); // variance
-// Propagate down
+ // normalize variance
+ caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
+ caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
+ variance_.mutable_cpu_data());
- // put scale * top_diff to buffer_blob_
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
- spatial_variance_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
- caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(),
- buffer_blob_.mutable_cpu_data());
-
- // use new top diff for computation
- caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(),
- buffer_blob_.cpu_data(), bottom_diff);
- // EX across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1), bottom_diff,
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
- // EX across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_mean_.cpu_data(),
- batch_sum_multiplier_.cpu_data(), Dtype(0),
- batch_mean_.mutable_cpu_data());
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(),
- batch_mean_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_mean_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- bottom_diff);
-
- caffe_mul(buffer_blob_.count(),
- x_norm_.cpu_data(), bottom_diff, bottom_diff);
-
- // EX across spatial
- caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_,
- H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
- // EX across batch
- caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_mean_.cpu_data(),
- batch_sum_multiplier_.cpu_data(), Dtype(0),
- batch_mean_.mutable_cpu_data());
-
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(),
- batch_mean_.cpu_data(), Dtype(0),
- spatial_mean_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_ * C_, H_ * W_, 1, Dtype(1),
- spatial_mean_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(1), bottom_diff);
-
- caffe_cpu_axpby(buffer_blob_.count(), Dtype(1),
- buffer_blob_.cpu_data(), Dtype(-1. / (N_ * H_ * W_)),
- bottom_diff);
-
- // put the squares of bottom into buffer_blob_
-// caffe_powx(buffer_blob_.count(), bottom_data, Dtype(2),
-// buffer_blob_.mutable_cpu_data());
+ // do mean and variance normalization
+ if (bottom[0] != top[0]) {
+ caffe_copy(bottom[0]->count(), bottom_data, top_data);
+ }
+ // subtract mean
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+ spatial_dim, 1, -1, num_by_chans_.cpu_data(),
+ spatial_sum_multiplier_.cpu_data(), 1., top_data);
+ // replicate variance to input size
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+ spatial_dim, 1, 1., num_by_chans_.cpu_data(),
+ spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
+ caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
+ // TODO(cdoersch): The caching is only needed because later in-place layers
+ // might clobber the data. Can we skip this if they won't?
+ caffe_copy(x_norm_.count(), top_data,
+ x_norm_.mutable_cpu_data());
+}
+
+template <typename Dtype>
+void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down,
+ const vector<Blob<Dtype>*>& bottom) {
+ CHECK(!use_global_stats_);
+ const Dtype* top_diff;
+ if (bottom[0] != top[0]) {
+ top_diff = top[0]->cpu_diff();
+ } else {
+ caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
+ top_diff = x_norm_.cpu_diff();
+ }
+ const Dtype* top_data = x_norm_.cpu_data();
+ Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+ int num = bottom[0]->shape()[0];
+ int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
+ // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
+ //
+ // dE(Y)/dX =
+ // (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
+ // ./ sqrt(var(X) + eps)
+ //
+ // where \cdot and ./ are hadamard product and elementwise division,
+ // respectively, dE/dY is the top diff, and mean/var/sum are all computed
+ // along all dimensions except the channels dimension. In the above
+ // equation, the operations allow for expansion (i.e. broadcast) along all
+ // dimensions except the channels dimension where required.
+
+ // sum(dE/dY \cdot Y)
+ caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
+ caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
+ bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
+ mean_.mutable_cpu_data());
+
+ // reshape (broadcast) the above
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+ spatial_dim, 1, 1., num_by_chans_.cpu_data(),
+ spatial_sum_multiplier_.cpu_data(), 0., bottom_diff);
+
+ // sum(dE/dY \cdot Y) \cdot Y
+ caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+ // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
+ caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
+ top_diff, spatial_sum_multiplier_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
+ mean_.mutable_cpu_data());
+ // reshape (broadcast) the above to make
+ // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
+ num_by_chans_.mutable_cpu_data());
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
+ spatial_dim, 1, 1., num_by_chans_.cpu_data(),
+ spatial_sum_multiplier_.cpu_data(), 1., bottom_diff);
+
+ // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
+ caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
+ Dtype(-1. / (num * spatial_dim)), bottom_diff);
+
+ // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
+ // pass.
+ caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
+}
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.cpu_data(),
- batch_variance_.cpu_data(), Dtype(0),
- spatial_variance_.mutable_cpu_data());
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
- N_ * C_, H_ * W_, 1, Dtype(1),
- spatial_variance_.cpu_data(),
- spatial_sum_multiplier_.cpu_data(), Dtype(0),
- buffer_blob_.mutable_cpu_data());
- caffe_div(buffer_blob_.count(), bottom_diff,
- buffer_blob_.cpu_data(), bottom_diff);
- }
#ifdef CPU_ONLY
STUB_GPU(BatchNormLayer);
#endif
- INSTANTIATE_CLASS(BatchNormLayer);
- REGISTER_LAYER_CLASS(BatchNorm);
+INSTANTIATE_CLASS(BatchNormLayer);
+REGISTER_LAYER_CLASS(BatchNorm);
} // namespace caffe
-
#include <vector>
#include "caffe/common_layers.hpp"
-#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
- template <typename Dtype>
- void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top) {
- const Dtype* bottom_data = bottom[0]->gpu_data();
- const Dtype* const_top_data = top[0]->gpu_data();
- Dtype* top_data = top[0]->mutable_gpu_data();
- Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data();
- Dtype* buffer_data = buffer_blob_.mutable_gpu_data();
- const Dtype* const_buffer_data = buffer_blob_.gpu_data();
-
-
- // put the squares of bottom into buffer_blob_
- caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
- buffer_blob_.mutable_gpu_data());
+template <typename Dtype>
+void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ int num = bottom[0]->shape(0);
+ int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
+
+ // elementwise square
+ caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
+ temp_.mutable_gpu_data());
+
+ if (use_global_stats_) {
+ // use the stored mean/variance estimates. TODO(cdoersch): allow an option
+ // to use an unbiased variance estimate, like the paper does.
+ const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
+ caffe_gpu_scale(variance_.count(), scale_factor,
+ this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data());
+ caffe_gpu_scale(variance_.count(), scale_factor,
+ this->blobs_[1]->gpu_data(), variance_.mutable_gpu_data());
+ } else {
// computes variance using var(X) = E(X^2) - (EX)^2
- // EX across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1. / (H_ * W_)),
- bottom_data, spatial_sum_multiplier_.gpu_data(),
- Dtype(0), spatial_mean_data);
- // EX across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
- spatial_mean_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- batch_mean_.mutable_gpu_data());
-
- // E(X^2) across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1. / (H_ * W_)), buffer_data,
- spatial_sum_multiplier_.gpu_data(), Dtype(0),
- spatial_variance_.mutable_gpu_data());
- // E(X^2) across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_),
- spatial_variance_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- batch_variance_.mutable_gpu_data());
-
- caffe_gpu_powx(batch_mean_.count(), batch_mean_.gpu_data(),
- Dtype(2), buffer_blob_.mutable_gpu_data()); // (EX)^2
- caffe_gpu_sub(batch_mean_.count(), batch_variance_.gpu_data(),
- buffer_data, batch_variance_.mutable_gpu_data()); // variance
-
- // do mean and variance normalization
- // subtract mean
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(), batch_mean_.gpu_data(), Dtype(0),
- spatial_mean_data);
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_,
- 1, -Dtype(1),
- spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0),
- buffer_blob_.mutable_gpu_data());
-
- caffe_gpu_add(buffer_blob_.count(), bottom_data, buffer_data, top_data);
-
- // normalize variance
- caffe_gpu_add_scalar(batch_variance_.count(), var_eps_,
- batch_variance_.mutable_gpu_data());
- caffe_gpu_powx(batch_variance_.count(), batch_variance_.gpu_data(),
- Dtype(0.5), batch_variance_.mutable_gpu_data());
-
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0),
- spatial_variance_.mutable_gpu_data());
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
- Dtype(0), buffer_blob_.mutable_gpu_data());
-
- caffe_gpu_div(buffer_blob_.count(), top_data, buffer_data, top_data);
-
- // Saving x_norm
- caffe_copy(top[0]->count(), const_top_data, x_norm_.mutable_gpu_data());
-
- // scale
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
- Dtype(0), spatial_variance_.mutable_gpu_data());
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
- Dtype(0), buffer_blob_.mutable_gpu_data());
-
- caffe_gpu_mul(buffer_blob_.count(), top_data, buffer_data, top_data);
-
- // shift
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(),
- this->blobs_[1]->gpu_data(), Dtype(0),
- spatial_mean_data);
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1,
- Dtype(1),
- spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0),
- buffer_blob_.mutable_gpu_data());
- caffe_gpu_add(buffer_blob_.count(), top_data, buffer_data, top_data);
+ caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
+ 1. / (num * spatial_dim), bottom_data,
+ spatial_sum_multiplier_.gpu_data(), 0.,
+ num_by_chans_.mutable_gpu_data());
+ caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+ mean_.mutable_gpu_data());
+ caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
+ 1. / (num * spatial_dim), temp_.gpu_data(),
+ spatial_sum_multiplier_.gpu_data(), 0.,
+ num_by_chans_.mutable_gpu_data());
+ caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+ num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+ variance_.mutable_gpu_data());
+ this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
+ this->blobs_[2]->mutable_cpu_data()[0] += 1;
+ caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(),
+ moving_average_fraction_, this->blobs_[0]->mutable_gpu_data());
+ Dtype m = Dtype(bottom[0]->count()/channels_);
+ caffe_gpu_axpby(variance_.count(), m/(m-1), variance_.gpu_data(),
+ moving_average_fraction_, this->blobs_[1]->mutable_gpu_data());
}
+ // elementwise square of mean
+ caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
+ temp_.mutable_gpu_data());
+
+ caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
+ variance_.mutable_gpu_data()); // variance
+
+ // normalize variance
+ caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
+ caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+ variance_.mutable_gpu_data());
- template <typename Dtype>
- void BatchNormLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down,
- const vector<Blob<Dtype>*>& bottom) {
- const Dtype* top_diff = top[0]->gpu_diff();
- const Dtype* top_data = top[0]->gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
- const Dtype* const_bottom_diff = bottom[0]->gpu_diff();
- Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data();
- Dtype* buffer_data = buffer_blob_.mutable_gpu_data();
- const Dtype* const_buffer_data = buffer_blob_.gpu_data();
-
- // Propage to layer params
- // gradient w.r.t. scale
- caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
- top_diff, buffer_blob_.mutable_gpu_data());
- // EX across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
- buffer_data, spatial_sum_multiplier_.gpu_data(), Dtype(0),
- spatial_variance_.mutable_gpu_data());
- // EX across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_variance_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- this->blobs_[0]->mutable_gpu_diff());
-
- // gradient w.r.t. shift
- // EX across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
- top_diff, spatial_sum_multiplier_.gpu_data(),
- Dtype(0), spatial_mean_data);
- // EX across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_mean_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- this->blobs_[1]->mutable_gpu_diff());
-
- // Propagate down
- // scale top diff
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(),
- Dtype(0), spatial_variance_.mutable_gpu_data());
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
- Dtype(0),
- buffer_blob_.mutable_gpu_data());
- caffe_gpu_mul(buffer_blob_.count(), top_diff, buffer_data,
- buffer_blob_.mutable_gpu_data());
-
- // use new top diff for computation
- caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
- buffer_data, bottom_diff);
- // EX across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_,
- Dtype(1), bottom_diff,
- spatial_sum_multiplier_.gpu_data(), Dtype(0), spatial_mean_data);
- // EX across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_mean_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- batch_mean_.mutable_gpu_data());
-
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(),
- batch_mean_.gpu_data(), Dtype(0),
- spatial_mean_data);
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1), spatial_mean_.gpu_data(),
- spatial_sum_multiplier_.gpu_data(), Dtype(0),
- bottom_diff);
-
- caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(),
- bottom_diff, bottom_diff);
-
- // EX across spatial
- caffe_gpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1),
- buffer_data, spatial_sum_multiplier_.gpu_data(),
- Dtype(0), spatial_mean_data);
-
- // EX across batch
- caffe_gpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1),
- spatial_mean_.gpu_data(),
- batch_sum_multiplier_.gpu_data(), Dtype(0),
- batch_mean_.mutable_gpu_data());
-
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_,
- C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(),
- batch_mean_.gpu_data(), Dtype(0),
- spatial_mean_data);
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
- Dtype(1),
- bottom_diff);
-
- caffe_gpu_axpby(buffer_blob_.count(), Dtype(1), buffer_data,
- Dtype(-1. / (N_ * H_ * W_)),
- bottom_diff);
-
- // put the squares of bottom into buffer_blob_
-// caffe_gpu_powx(buffer_blob_.count(), bottom_data, Dtype(2),
-// buffer_blob_.mutable_gpu_data());
-
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
- batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0),
- spatial_variance_.mutable_gpu_data());
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_,
- H_ * W_, 1, Dtype(1),
- spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(),
- Dtype(0),
- buffer_blob_.mutable_gpu_data());
-
- caffe_gpu_div(buffer_blob_.count(), const_bottom_diff,
- const_buffer_data, bottom_diff);
+ // do mean and variance normalization
+ if (bottom[0] != top[0]) {
+ caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
+ // subtract mean
+ caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
+ num_by_chans_.mutable_gpu_data());
+ caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+ spatial_dim, 1, -1, num_by_chans_.gpu_data(),
+ spatial_sum_multiplier_.gpu_data(), 1., top_data);
+ // replicate variance to input size
+ caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+ batch_sum_multiplier_.gpu_data(), variance_.gpu_data(), 0.,
+ num_by_chans_.mutable_gpu_data());
+ caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+ spatial_dim, 1, 1., num_by_chans_.gpu_data(),
+ spatial_sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+ caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
+ // TODO(cdoersch): The caching is only needed because later in-place layers
+ // might clobber the data. Can we skip this if they won't?
+ caffe_copy(x_norm_.count(), top_data,
+ x_norm_.mutable_gpu_data());
+}
+
+template <typename Dtype>
+void BatchNormLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!use_global_stats_);  // gradient only defined for minibatch statistics
+  const Dtype* top_diff;
+  if (bottom[0] != top[0]) {
+    top_diff = top[0]->gpu_diff();
+  } else {
+    caffe_copy(x_norm_.count(), top[0]->gpu_diff(), x_norm_.mutable_gpu_diff());
+    top_diff = x_norm_.gpu_diff();
+  }
+  const Dtype* top_data = x_norm_.gpu_data();  // saved by Forward
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  int num = bottom[0]->shape()[0];
+  int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
+  // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
+  //
+  // dE(Y)/dX =
+  //   (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
+  //     ./ sqrt(var(X) + eps)
+  //
+  // where \cdot and ./ are hadamard product and elementwise division,
+  // respectively, dE/dY is the top diff, and mean/var/sum are all computed
+  // along all dimensions except the channels dimension.  In the above
+  // equation, the operations allow for expansion (i.e. broadcast) along all
+  // dimensions except the channels dimension where required.
+
+  // sum(dE/dY \cdot Y): elementwise product, reduced spatially, then over batch
+  caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
+  caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
+      bottom_diff, spatial_sum_multiplier_.gpu_data(), 0.,
+      num_by_chans_.mutable_gpu_data());
+  caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+      num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+      mean_.mutable_gpu_data());
+
+  // reshape (broadcast) the per-channel sums back to the full input size
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+      batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
+      num_by_chans_.mutable_gpu_data());
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
+      spatial_dim, 1, 1., num_by_chans_.gpu_data(),
+      spatial_sum_multiplier_.gpu_data(), 0., bottom_diff);
+
+  // sum(dE/dY \cdot Y) \cdot Y -- multiply broadcast sums by normalized data
+  caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+  // now compute sum(dE/dY): reduce top_diff spatially, then over the batch
+  caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
+      top_diff, spatial_sum_multiplier_.gpu_data(), 0.,
+      num_by_chans_.mutable_gpu_data());
+  caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
+      num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+      mean_.mutable_gpu_data());
+  // broadcast sum(dE/dY) and accumulate (beta = 1) onto bottom_diff, giving
+  // sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y; negation happens in axpby below
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
+      batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
+      num_by_chans_.mutable_gpu_data());
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
+      spatial_dim, 1, 1., num_by_chans_.gpu_data(),
+      spatial_sum_multiplier_.gpu_data(), 1., bottom_diff);
+
+  // dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y  (sums -> means here)
+  caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff,
+      Dtype(-1. / (num * spatial_dim)), bottom_diff);
+
+  // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
+  // pass.
+  caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(BatchNormLayer);
- INSTANTIATE_LAYER_GPU_FUNCS(BatchNormLayer);
-} // namespace caffe
+}  // namespace caffe