template <typename Dtype>
void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
(*top)[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
bottom[0]->height(), bottom[0]->width());
- sum_multiplier_.Reshape(1, bottom[0]->channels(),
- bottom[0]->height(), bottom[0]->width());
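+ // sum_multiplier_ is now a channels-length vector of ones; the gemm and
+ // gemv calls below use it to broadcast and sum across channels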
+ sum_multiplier_.Reshape(1, bottom[0]->channels(), 1, 1);
Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
for (int i = 0; i < sum_multiplier_.count(); ++i) {
multiplier_data[i] = 1.;
}
- scale_.Reshape(bottom[0]->num(), 1, 1, 1);
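+ // scale_ now holds one intermediate value per spatial location (first the
+ // running max, later the exp-sum), rather than one value per image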
+ scale_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width());
}
template <typename Dtype>
void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = (*top)[0]->mutable_cpu_data();
Dtype* scale_data = scale_.mutable_cpu_data();
int num = bottom[0]->num();
+ int channels = bottom[0]->channels();
int dim = bottom[0]->count() / bottom[0]->num();
+ int spatial_dim = bottom[0]->height() * bottom[0]->width();
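+ // note that dim == channels * spatial_dim: the softmax now runs over
+ // channels independently at each of the spatial_dim locations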
caffe_copy(bottom[0]->count(), bottom_data, top_data);
- // we need to subtract the max to avoid numerical issues, compute the exp,
+ // We need to subtract the max to avoid numerical issues, compute the exp,
// and then normalize.
for (int i = 0; i < num; ++i) {
- scale_data[i] = bottom_data[i*dim];
- for (int j = 0; j < dim; ++j) {
- scale_data[i] = std::max(scale_data[i], bottom_data[i * dim + j]);
+ // initialize scale_data to the first plane
+ caffe_copy(spatial_dim, bottom_data + i * dim, scale_data);
+ for (int j = 0; j < channels; j++) {
+ for (int k = 0; k < spatial_dim; k++) {
+ scale_data[k] = std::max(scale_data[k],
+ bottom_data[i * dim + j * spatial_dim + k]);
+ }
+ }
+ // subtraction: a rank-1 gemm subtracts each location's max from
+ // every channel at that location
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, spatial_dim,
+ 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data + i * dim);
+ // exponentiation
+ caffe_exp<Dtype>(dim, top_data + i * dim, top_data + i * dim);
+ // sum after exp: gemv with the all-ones multiplier sums over channels,
+ // leaving one total per spatial location
+ caffe_cpu_gemv<Dtype>(CblasTrans, channels, spatial_dim, 1.,
+ top_data + i * dim, sum_multiplier_.cpu_data(), 0., scale_data);
+ // division: normalize each channel plane by the per-location sum
+ for (int j = 0; j < channels; j++) {
+ caffe_div(spatial_dim, top_data + (*top)[0]->offset(i, j), scale_data,
+ top_data + (*top)[0]->offset(i, j));
}
- }
- // subtraction
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
- scale_data, sum_multiplier_.cpu_data(), 1., top_data);
- // Perform exponentiation
- caffe_exp<Dtype>(num * dim, top_data, top_data);
- // sum after exp
- caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_data,
- sum_multiplier_.cpu_data(), 0., scale_data);
- // Do division
- for (int i = 0; i < num; ++i) {
- caffe_scal<Dtype>(dim, Dtype(1.) / scale_data[i], top_data + i * dim);
}
}
template <typename Dtype>
void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* top_data = top[0]->cpu_data();
Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
Dtype* scale_data = scale_.mutable_cpu_data();
int num = top[0]->num();
+ int channels = top[0]->channels();
int dim = top[0]->count() / top[0]->num();
+ int spatial_dim = top[0]->height() * top[0]->width();
caffe_copy(top[0]->count(), top_diff, bottom_diff);
- // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff
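+ // for softmax, dE/dz = (top_diff - dot(top_diff, top_data)) * top_data,
+ // computed here independently at each spatial location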
for (int i = 0; i < num; ++i) {
- scale_data[i] = caffe_cpu_dot<Dtype>(dim, top_diff + i * dim,
- top_data + i * dim);
+ // compute dot(top_diff, top_data) per spatial location and subtract it
+ // from the bottom diff
+ for (int k = 0; k < spatial_dim; ++k) {
+ scale_data[k] = caffe_cpu_strided_dot<Dtype>(channels,
+ bottom_diff + i * dim + k, spatial_dim,
+ top_data + i * dim + k, spatial_dim);
+ }
+ // subtraction: broadcast each location's dot product across its channels
+ caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, spatial_dim, 1,
+ -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim);
}
- // subtraction
- caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
- scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff);
// elementwise multiplication
- caffe_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
+ caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
}
template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
// The forward pass computes the softmax prob values.
- softmax_bottom_vec_[0] = bottom[0];
softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_);
const Dtype* prob_data = prob_.cpu_data();
const Dtype* label = bottom[1]->cpu_data();
int num = prob_.num();
int dim = prob_.count() / num;
+ int spatial_dim = prob_.height() * prob_.width();
Dtype loss = 0;
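+ // accumulate the loss over every spatial location of every image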
for (int i = 0; i < num; ++i) {
- loss += -log(std::max(prob_data[i * dim + static_cast<int>(label[i])],
- Dtype(FLT_MIN)));
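+ // label now holds one class index per spatial location; look up the
+ // predicted probability of the labeled channel at each location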
+ for (int j = 0; j < spatial_dim; j++) {
+ loss -= log(std::max(prob_data[i * dim +
+ static_cast<int>(label[i * spatial_dim + j]) * spatial_dim + j],
+ Dtype(FLT_MIN)));
+ }
}
- (*top)[0]->mutable_cpu_data()[0] = loss / num;
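+ // normalize over both images and spatial locations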
+ (*top)[0]->mutable_cpu_data()[0] = loss / num / spatial_dim;
if (top->size() == 2) {
(*top)[1]->ShareData(prob_);
}
}
template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
if (propagate_down[0]) {
Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
const Dtype* prob_data = prob_.cpu_data();
caffe_copy(prob_.count(), prob_data, bottom_diff);
const Dtype* label = (*bottom)[1]->cpu_data();
int num = prob_.num();
int dim = prob_.count() / num;
+ int spatial_dim = prob_.height() * prob_.width();
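+ // bottom_diff already holds the softmax output, so the gradient
+ // prob - 1{label} only needs a subtraction at each labeled entry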
for (int i = 0; i < num; ++i) {
- bottom_diff[i * dim + static_cast<int>(label[i])] -= 1;
+ for (int j = 0; j < spatial_dim; ++j) {
+ bottom_diff[i * dim + static_cast<int>(label[i * spatial_dim + j])
+ * spatial_dim + j] -= 1;
+ }
}
// Scale gradient
const Dtype loss_weight = top[0]->cpu_diff()[0];
- caffe_scal(prob_.count(), loss_weight / num, bottom_diff);
+ caffe_scal(prob_.count(), loss_weight / num / spatial_dim, bottom_diff);
}
}