virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+ /// Read the normalization mode parameter and compute the normalizer based
+ /// on the blob size. If normalization_mode is VALID, the count of valid
+ /// outputs will be read from valid_count, unless it is -1 in which case
+ /// all outputs are assumed to be valid.
+ virtual Dtype get_normalizer(
+ LossParameter_NormalizationMode normalization_mode, int valid_count);
/// The internal SoftmaxLayer used to map predictions to a distribution.
shared_ptr<Layer<Dtype> > softmax_layer_;
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
- /// Whether to normalize the loss by the total number of values present
- /// (otherwise just by the batch size).
- bool normalize_;
+ /// How to normalize the output loss.
+ LossParameter_NormalizationMode normalization_;
int softmax_axis_, outer_num_, inner_num_;
};
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label();
}
- normalize_ = this->layer_param_.loss_param().normalize();
+ if (!this->layer_param_.loss_param().has_normalization() &&
+ this->layer_param_.loss_param().has_normalize()) {
+ normalization_ = this->layer_param_.loss_param().normalize() ?
+ LossParameter_NormalizationMode_VALID :
+ LossParameter_NormalizationMode_BATCH_SIZE;
+ } else {
+ normalization_ = this->layer_param_.loss_param().normalization();
+ }
}
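The branch above keeps the deprecated normalize flag working: an explicit normalization field always wins; otherwise normalize: false maps to BATCH_SIZE and normalize: true maps to VALID, and if neither field is set the new default (VALID) applies. As a hedged illustration, a legacy prototxt fragment such as

loss_param {
  normalize: false  # deprecated flag; with no normalization field this resolves to BATCH_SIZE
}

keeps its old behavior of dividing the loss by the batch size only.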
template <typename Dtype>
}
template <typename Dtype>
+Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
+ LossParameter_NormalizationMode normalization_mode, int valid_count) {
+ Dtype normalizer;
+ switch (normalization_mode) {
+ case LossParameter_NormalizationMode_FULL:
+ normalizer = Dtype(outer_num_ * inner_num_);
+ break;
+ case LossParameter_NormalizationMode_VALID:
+ if (valid_count == -1) {
+ normalizer = Dtype(outer_num_ * inner_num_);
+ } else {
+ normalizer = Dtype(valid_count);
+ }
+ break;
+ case LossParameter_NormalizationMode_BATCH_SIZE:
+ normalizer = Dtype(outer_num_);
+ break;
+ case LossParameter_NormalizationMode_NONE:
+ normalizer = Dtype(1);
+ break;
+ default:
+ LOG(FATAL) << "Unknown normalization mode: "
+ << LossParameter_NormalizationMode_Name(normalization_mode);
+ }
+ // Some users will have no labels for some examples in order to 'turn off' a
+ // particular loss in a multi-task setup. The max prevents NaNs in that case.
+ return std::max(Dtype(1.0), normalizer);
+}
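A hypothetical worked example of the modes above: with outer_num_ = 8 images and inner_num_ = 500 * 500 pixels per image, FULL divides the summed loss by 8 * 250000 = 2,000,000, BATCH_SIZE divides by 8, NONE divides by 1, and VALID divides by however many of those 2,000,000 outputs do not carry ignore_label (or by the FULL count when valid_count is -1). The final std::max clamp means that even if every output in the batch is ignored, the normalizer stays at 1 instead of 0, so the loss comes out as 0 rather than NaN.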
+
+template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
// The forward pass computes the softmax prob values.
++count;
}
}
- if (normalize_) {
- top[0]->mutable_cpu_data()[0] = loss / count;
- } else {
- top[0]->mutable_cpu_data()[0] = loss / outer_num_;
- }
+ top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
}
}
// Scale gradient
- const Dtype loss_weight = top[0]->cpu_diff()[0];
- if (normalize_) {
- caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
- } else {
- caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
- }
+ Dtype loss_weight = top[0]->cpu_diff()[0] /
+ get_normalizer(normalization_, count);
+ caffe_scal(prob_.count(), loss_weight, bottom_diff);
}
}
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
Dtype loss;
caffe_gpu_asum(nthreads, loss_data, &loss);
- if (normalize_) {
- Dtype count;
- caffe_gpu_asum(nthreads, counts, &count);
- loss /= count;
- } else {
- loss /= outer_num_;
+ Dtype valid_count = -1;
+ // Only launch another CUDA kernel if we actually need the count of valid
+ // outputs.
+ if (normalization_ == LossParameter_NormalizationMode_VALID &&
+ has_ignore_label_) {
+ caffe_gpu_asum(nthreads, counts, &valid_count);
}
- top[0]->mutable_cpu_data()[0] = loss;
+ top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_,
+ valid_count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
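Skipping the extra caffe_gpu_asum here is safe: the non-VALID modes never read valid_count, and when normalization_ is VALID but no ignore_label is set, leaving valid_count at -1 makes get_normalizer fall back to outer_num_ * inner_num_, which is exactly the number of valid outputs in that case.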
SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
- const Dtype loss_weight = top[0]->cpu_diff()[0];
- if (normalize_) {
- Dtype count;
- caffe_gpu_asum(nthreads, counts, &count);
- caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
- } else {
- caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+
+ Dtype valid_count = -1;
+ // Only launch another CUDA kernel if we actually need the count of valid
+ // outputs.
+ if (normalization_ == LossParameter_NormalizationMode_VALID &&
+ has_ignore_label_) {
+ caffe_gpu_asum(nthreads, counts, &valid_count);
}
+ const Dtype loss_weight = top[0]->cpu_diff()[0] /
+ get_normalizer(normalization_, valid_count);
+ caffe_gpu_scal(prob_.count(), loss_weight, bottom_diff);
}
}
message LossParameter {
// If specified, ignore instances with the given label.
optional int32 ignore_label = 1;
- // If true, normalize each batch across all instances (including spatial
- // dimesions, but not ignored instances); else, divide by batch size only.
- optional bool normalize = 2 [default = true];
+ // How to normalize the loss for loss layers that aggregate across batches,
+ // spatial dimensions, or other dimensions. Currently only implemented in
+ // SoftmaxWithLoss layer.
+ enum NormalizationMode {
+ // Divide by the number of examples in the batch times spatial dimensions.
+ // Outputs that receive the ignore label will NOT be ignored in computing
+ // the normalization factor.
+ FULL = 0;
+ // Divide by the total number of output locations that do not take the
+ // ignore_label. If ignore_label is not set, this behaves like FULL.
+ VALID = 1;
+ // Divide by the batch size.
+ BATCH_SIZE = 2;
+ // Do not normalize the loss.
+ NONE = 3;
+ }
+ optional NormalizationMode normalization = 3 [default = VALID];
+ // Deprecated. Ignored if normalization is specified. If normalization
+ // is not specified, then setting this to false will be equivalent to
+ // normalization = BATCH_SIZE to be consistent with previous behavior.
+ optional bool normalize = 2;
}
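For reference, a hypothetical layer definition using the new field (layer and blob names are made up) could look like:

layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "score"
  bottom: "label"
  top: "loss"
  loss_param {
    ignore_label: 255
    normalization: VALID
  }
}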
// Messages that store parameters used by individual layer types follow, in