From: Carl Doersch
Date: Fri, 6 Nov 2015 22:41:30 +0000 (-0800)
Subject: Better normalization options for SoftmaxWithLoss layer.
X-Git-Tag: submit/tizen/20180823.020014~287^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8b2aa7093cba002a5f286d47658de72a961d1299;p=platform%2Fupstream%2Fcaffeonacl.git

Better normalization options for SoftmaxWithLoss layer.
---

diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index d08ad9b..d6569c4 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -747,6 +747,12 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
 
   /// The internal SoftmaxLayer used to map predictions to a distribution.
   shared_ptr<Layer<Dtype> > softmax_layer_;
@@ -760,9 +766,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   bool has_ignore_label_;
   /// The label indicating that an instance should be ignored.
   int ignore_label_;
-  /// Whether to normalize the loss by the total number of values present
-  /// (otherwise just by the batch size).
-  bool normalize_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
 
   int softmax_axis_, outer_num_, inner_num_;
 };
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index dee50ac..3cdef82 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -25,7 +25,14 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
   if (has_ignore_label_) {
     ignore_label_ = this->layer_param_.loss_param().ignore_label();
   }
-  normalize_ = this->layer_param_.loss_param().normalize();
+  if (!this->layer_param_.loss_param().has_normalization() &&
+      this->layer_param_.loss_param().has_normalize()) {
+    normalization_ = this->layer_param_.loss_param().normalize() ?
+                     LossParameter_NormalizationMode_VALID :
+                     LossParameter_NormalizationMode_BATCH_SIZE;
+  } else {
+    normalization_ = this->layer_param_.loss_param().normalization();
+  }
 }
 
 template <typename Dtype>
@@ -49,6 +56,36 @@ void SoftmaxWithLossLayer<Dtype>::Reshape(
 }
 
 template <typename Dtype>
+Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
+}
+
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   // The forward pass computes the softmax prob values.
@@ -71,11 +108,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
       ++count;
     }
   }
-  if (normalize_) {
-    top[0]->mutable_cpu_data()[0] = loss / count;
-  } else {
-    top[0]->mutable_cpu_data()[0] = loss / outer_num_;
-  }
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
   }
@@ -109,12 +142,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
     // Scale gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
+    Dtype loss_weight = top[0]->cpu_diff()[0] /
+                        get_normalizer(normalization_, count);
+    caffe_scal(prob_.count(), loss_weight, bottom_diff);
   }
 }
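For illustration (not part of this patch), the effect of each normalization
mode on the divisor can be seen in a small standalone C++ sketch of the
get_normalizer() logic above. The plain NormalizationMode enum and the
outer_num/inner_num/valid_count arguments stand in for the protobuf-generated
LossParameter_NormalizationMode and the layer's outer_num_/inner_num_ members;
the numbers in main() describe a hypothetical 32-image batch of 10x10 score
maps with 300 ignored locations.

// Standalone sketch, not Caffe code: float in place of Dtype, a plain enum in
// place of LossParameter_NormalizationMode.
#include <algorithm>
#include <cstdio>

enum NormalizationMode { FULL, VALID, BATCH_SIZE, NONE };

// valid_count == -1 means "assume every output is valid", matching the
// sentinel used by get_normalizer() in the patch.
float get_normalizer(NormalizationMode mode, int outer_num, int inner_num,
                     int valid_count) {
  float normalizer = 1.0f;
  switch (mode) {
    case FULL:
      normalizer = static_cast<float>(outer_num * inner_num);
      break;
    case VALID:
      normalizer = (valid_count == -1)
          ? static_cast<float>(outer_num * inner_num)
          : static_cast<float>(valid_count);
      break;
    case BATCH_SIZE:
      normalizer = static_cast<float>(outer_num);
      break;
    case NONE:
      normalizer = 1.0f;
      break;
  }
  // Clamp to 1 so a batch whose labels are all ignored cannot divide by zero.
  return std::max(1.0f, normalizer);
}

int main() {
  const int outer_num = 32;       // batch size
  const int inner_num = 10 * 10;  // spatial locations per example
  const int valid = 3200 - 300;   // locations not carrying the ignore label
  std::printf("FULL       -> %.0f\n",
              get_normalizer(FULL, outer_num, inner_num, valid));        // 3200
  std::printf("VALID      -> %.0f\n",
              get_normalizer(VALID, outer_num, inner_num, valid));       // 2900
  std::printf("BATCH_SIZE -> %.0f\n",
              get_normalizer(BATCH_SIZE, outer_num, inner_num, valid));  // 32
  std::printf("NONE       -> %.0f\n",
              get_normalizer(NONE, outer_num, inner_num, valid));        // 1
  return 0;
}

The std::max clamp mirrors the comment in the patch: when every label in a
batch is ignored, the loss is divided by 1 rather than by 0.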
diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu
index 42e91fa..4753a1e 100644
--- a/src/caffe/layers/softmax_loss_layer.cu
+++ b/src/caffe/layers/softmax_loss_layer.cu
@@ -49,14 +49,15 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
       outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
   Dtype loss;
   caffe_gpu_asum(nthreads, loss_data, &loss);
-  if (normalize_) {
-    Dtype count;
-    caffe_gpu_asum(nthreads, counts, &count);
-    loss /= count;
-  } else {
-    loss /= outer_num_;
+  Dtype valid_count = -1;
+  // Only launch another CUDA kernel if we actually need the count of valid
+  // outputs.
+  if (normalization_ == LossParameter_NormalizationMode_VALID &&
+      has_ignore_label_) {
+    caffe_gpu_asum(nthreads, counts, &valid_count);
   }
-  top[0]->mutable_cpu_data()[0] = loss;
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_,
+                                                        valid_count);
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
   }
@@ -108,14 +109,17 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
         CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
         outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      Dtype count;
-      caffe_gpu_asum(nthreads, counts, &count);
-      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+
+    Dtype valid_count = -1;
+    // Only launch another CUDA kernel if we actually need the count of valid
+    // outputs.
+    if (normalization_ == LossParameter_NormalizationMode_VALID &&
+        has_ignore_label_) {
+      caffe_gpu_asum(nthreads, counts, &valid_count);
     }
+    const Dtype loss_weight = top[0]->cpu_diff()[0] /
+                              get_normalizer(normalization_, valid_count);
+    caffe_gpu_scal(prob_.count(), loss_weight , bottom_diff);
   }
 }
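For illustration (not part of this patch), the host-side pattern shared by the
two GPU hunks above can be sketched without CUDA: the extra reduction over the
per-location counts buffer is only performed when the VALID count is actually
needed, and otherwise the -1 sentinel tells get_normalizer() to assume every
output is valid. The count_valid callback below is a stand-in for the
caffe_gpu_asum(nthreads, counts, &valid_count) call.

// Standalone sketch, not Caffe code.
#include <cstdio>
#include <functional>

enum NormalizationMode { FULL, VALID, BATCH_SIZE, NONE };

float lazy_valid_count(NormalizationMode mode, bool has_ignore_label,
                       const std::function<float()>& count_valid) {
  float valid_count = -1.0f;  // sentinel: "all outputs are valid"
  if (mode == VALID && has_ignore_label) {
    valid_count = count_valid();  // only pay for the extra reduction here
  }
  return valid_count;
}

int main() {
  int reductions = 0;
  auto count_valid = [&]() { ++reductions; return 2900.0f; };  // fake count
  std::printf("%.0f\n", lazy_valid_count(VALID, true,  count_valid));  // 2900
  std::printf("%.0f\n", lazy_valid_count(VALID, false, count_valid));  // -1
  std::printf("%.0f\n", lazy_valid_count(FULL,  true,  count_valid));  // -1
  std::printf("reductions performed: %d\n", reductions);               // 1
  return 0;
}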
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 39873cf..787369f 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -420,9 +420,27 @@ message TransformationParameter {
 message LossParameter {
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 1;
-  // If true, normalize each batch across all instances (including spatial
-  // dimesions, but not ignored instances); else, divide by batch size only.
-  optional bool normalize = 2 [default = true];
+  // How to normalize the loss for loss layers that aggregate across batches,
+  // spatial dimensions, or other dimensions.  Currently only implemented in
+  // SoftmaxWithLoss layer.
+  enum NormalizationMode {
+    // Divide by the number of examples in the batch times spatial dimensions.
+    // Outputs that receive the ignore label will NOT be ignored in computing
+    // the normalization factor.
+    FULL = 0;
+    // Divide by the total number of output locations that do not take the
+    // ignore_label.  If ignore_label is not set, this behaves like FULL.
+    VALID = 1;
+    // Divide by the batch size.
+    BATCH_SIZE = 2;
+    // Do not normalize the loss.
+    NONE = 3;
+  }
+  optional NormalizationMode normalization = 3 [default = VALID];
+  // Deprecated.  Ignored if normalization is specified.  If normalization
+  // is not specified, then setting this to false will be equivalent to
+  // normalization = BATCH_SIZE to be consistent with previous behavior.
+  optional bool normalize = 2;
 }
 
 // Messages that store parameters used by individual layer types follow, in
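For reference (not part of this patch), a SoftmaxWithLoss layer using the new
field might be declared in a prototxt like the following; the layer name, blob
names, and ignore_label value are made up.

layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "score"
  bottom: "label"
  top: "loss"
  loss_param {
    ignore_label: 255
    normalization: VALID
  }
}

Legacy definitions that only set the deprecated normalize field keep their
previous behavior: LayerSetUp maps normalize: true to VALID and
normalize: false to BATCH_SIZE, and an explicit normalization value always
takes precedence.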