Better normalization options for SoftmaxWithLoss layer.
author    Carl Doersch <cdoersch@cs.cmu.edu>
          Fri, 6 Nov 2015 22:41:30 +0000 (14:41 -0800)
committer Carl Doersch <cdoersch@cs.cmu.edu>
          Sun, 22 Nov 2015 22:47:10 +0000 (14:47 -0800)
include/caffe/loss_layers.hpp
src/caffe/layers/softmax_loss_layer.cpp
src/caffe/layers/softmax_loss_layer.cu
src/caffe/proto/caffe.proto

diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index d08ad9b..d6569c4 100644
@@ -747,6 +747,12 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1, in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
 
   /// The internal SoftmaxLayer used to map predictions to a distribution.
   shared_ptr<Layer<Dtype> > softmax_layer_;
@@ -760,9 +766,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   bool has_ignore_label_;
   /// The label indicating that an instance should be ignored.
   int ignore_label_;
-  /// Whether to normalize the loss by the total number of values present
-  /// (otherwise just by the batch size).
-  bool normalize_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
 
   int softmax_axis_, outer_num_, inner_num_;
 };
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index dee50ac..3cdef82 100644
@@ -25,7 +25,14 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
   if (has_ignore_label_) {
     ignore_label_ = this->layer_param_.loss_param().ignore_label();
   }
-  normalize_ = this->layer_param_.loss_param().normalize();
+  if (!this->layer_param_.loss_param().has_normalization() &&
+      this->layer_param_.loss_param().has_normalize()) {
+    normalization_ = this->layer_param_.loss_param().normalize() ?
+                     LossParameter_NormalizationMode_VALID :
+                     LossParameter_NormalizationMode_BATCH_SIZE;
+  } else {
+    normalization_ = this->layer_param_.loss_param().normalization();
+  }
 }
 
 template <typename Dtype>
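
The fallback above keeps old prototxts working: the deprecated boolean only takes effect when the new normalization field is absent, and the field's default (VALID) applies otherwise. Below is a minimal standalone sketch of that precedence, using a local enum and plain booleans in place of the generated LossParameter accessors (names here are illustrative, not Caffe's):

#include <iostream>

// Illustrative stand-in for LossParameter_NormalizationMode.
enum NormalizationMode { FULL, VALID, BATCH_SIZE, NONE };

// Mirrors the precedence in LayerSetUp: an explicit `normalization` wins;
// otherwise a legacy `normalize` flag maps to VALID (true) or BATCH_SIZE
// (false); if neither field is set, the proto default (VALID) applies.
NormalizationMode ResolveNormalization(bool has_normalization,
                                       NormalizationMode normalization,
                                       bool has_normalize, bool normalize) {
  if (!has_normalization && has_normalize) {
    return normalize ? VALID : BATCH_SIZE;
  }
  return has_normalization ? normalization : VALID;
}

int main() {
  // A legacy prototxt that set `normalize: false` keeps its old behavior.
  std::cout << (ResolveNormalization(false, VALID, true, false) == BATCH_SIZE)
            << std::endl;  // prints 1
}
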
@@ -49,6 +56,36 @@ void SoftmaxWithLossLayer<Dtype>::Reshape(
 }
 
 template <typename Dtype>
+Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
+}
+
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   // The forward pass computes the softmax prob values.
@@ -71,11 +108,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
       ++count;
     }
   }
-  if (normalize_) {
-    top[0]->mutable_cpu_data()[0] = loss / count;
-  } else {
-    top[0]->mutable_cpu_data()[0] = loss / outer_num_;
-  }
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
   }
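
To make the four modes concrete, here is a self-contained sketch of the normalizer arithmetic for a hypothetical batch of 2 images with an 8x8 prediction map (outer_num_ = 2, inner_num_ = 64) where 20 outputs carry the ignore label; the enum, helper, and numbers are illustrative only:

#include <algorithm>
#include <iostream>

enum NormalizationMode { FULL, VALID, BATCH_SIZE, NONE };

// Same arithmetic as get_normalizer(): valid_count == -1 means "assume all
// outputs are valid" (used when no ignore_label is configured).
double Normalizer(NormalizationMode mode, int outer, int inner,
                  int valid_count) {
  double n = 1.0;
  switch (mode) {
    case FULL:       n = outer * inner; break;
    case VALID:      n = (valid_count == -1) ? outer * inner : valid_count; break;
    case BATCH_SIZE: n = outer; break;
    case NONE:       n = 1.0; break;
  }
  return std::max(1.0, n);  // guards against an all-ignored batch
}

int main() {
  const int outer = 2, inner = 64, valid = 2 * 64 - 20;
  std::cout << Normalizer(FULL, outer, inner, valid) << "\n"        // 128
            << Normalizer(VALID, outer, inner, valid) << "\n"       // 108
            << Normalizer(BATCH_SIZE, outer, inner, valid) << "\n"  // 2
            << Normalizer(NONE, outer, inner, valid) << "\n";       // 1
}
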
@@ -109,12 +142,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
     // Scale gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
+    Dtype loss_weight = top[0]->cpu_diff()[0] /
+                        get_normalizer(normalization_, count);
+    caffe_scal(prob_.count(), loss_weight, bottom_diff);
   }
 }
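
The backward pass folds the same normalizer into the scalar handed to caffe_scal, so the gradient is scaled consistently with the reported loss. A small sketch of that folding, with a plain loop standing in for caffe_scal and made-up gradient values:

#include <iostream>
#include <vector>

// Scale a diff buffer by loss_weight / normalizer in one pass, the same
// scalar folding done just before caffe_scal in Backward_cpu.
void ScaleDiff(std::vector<double>& diff, double loss_weight,
               double normalizer) {
  const double scale = loss_weight / normalizer;
  for (double& d : diff) d *= scale;
}

int main() {
  std::vector<double> diff = {0.5, -0.25, 0.75};  // illustrative gradients
  ScaleDiff(diff, /*loss_weight=*/1.0, /*normalizer=*/108.0);
  for (double d : diff) std::cout << d << " ";
  std::cout << std::endl;
}
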
 
diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu
index 42e91fa..4753a1e 100644
@@ -49,14 +49,15 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
       outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
   Dtype loss;
   caffe_gpu_asum(nthreads, loss_data, &loss);
-  if (normalize_) {
-    Dtype count;
-    caffe_gpu_asum(nthreads, counts, &count);
-    loss /= count;
-  } else {
-    loss /= outer_num_;
+  Dtype valid_count = -1;
+  // Only launch another CUDA kernel if we actually need the count of valid
+  // outputs.
+  if (normalization_ == LossParameter_NormalizationMode_VALID &&
+      has_ignore_label_) {
+    caffe_gpu_asum(nthreads, counts, &valid_count);
   }
-  top[0]->mutable_cpu_data()[0] = loss;
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_,
+                                                        valid_count);
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
   }
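
Both GPU paths use the same -1 sentinel so the extra reduction over counts is only paid for when the normalizer actually needs it. A CPU-side sketch of that pattern, with std::accumulate standing in for caffe_gpu_asum and illustrative per-image counts:

#include <iostream>
#include <numeric>
#include <vector>

enum NormalizationMode { FULL, VALID, BATCH_SIZE, NONE };

// Stand-in for the counts reduction (caffe_gpu_asum over the per-thread
// valid-output counters).  Only called when the result will be used.
double SumCounts(const std::vector<double>& counts) {
  return std::accumulate(counts.begin(), counts.end(), 0.0);
}

int main() {
  const bool has_ignore_label = true;
  const NormalizationMode mode = VALID;
  std::vector<double> counts = {64, 44};  // illustrative per-image counts

  double valid_count = -1;  // sentinel: "assume every output is valid"
  if (mode == VALID && has_ignore_label) {
    valid_count = SumCounts(counts);  // pay for the reduction only here
  }
  std::cout << valid_count << std::endl;  // 108
}
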
@@ -108,14 +109,17 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
         CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
         outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      Dtype count;
-      caffe_gpu_asum(nthreads, counts, &count);
-      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+
+    Dtype valid_count = -1;
+    // Only launch another CUDA kernel if we actually need the count of valid
+    // outputs.
+    if (normalization_ == LossParameter_NormalizationMode_VALID &&
+        has_ignore_label_) {
+      caffe_gpu_asum(nthreads, counts, &valid_count);
     }
+    const Dtype loss_weight = top[0]->cpu_diff()[0] /
+                              get_normalizer(normalization_, valid_count);
+    caffe_gpu_scal(prob_.count(), loss_weight, bottom_diff);
   }
 }
 
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 39873cf..787369f 100644
@@ -420,9 +420,27 @@ message TransformationParameter {
 message LossParameter {
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 1;
-  // If true, normalize each batch across all instances (including spatial
-  // dimesions, but not ignored instances); else, divide by batch size only.
-  optional bool normalize = 2 [default = true];
+  // How to normalize the loss for loss layers that aggregate across batches,
+  // spatial dimensions, or other dimensions.  Currently only implemented in
+  // the SoftmaxWithLoss layer.
+  enum NormalizationMode {
+    // Divide by the number of examples in the batch times spatial dimensions.
+    // Outputs that receive the ignore label will NOT be ignored in computing
+    // the normalization factor.
+    FULL = 0;
+    // Divide by the total number of output locations that do not take the
+    // ignore_label.  If ignore_label is not set, this behaves like FULL.
+    VALID = 1;
+    // Divide by the batch size.
+    BATCH_SIZE = 2;
+    // Do not normalize the loss.
+    NONE = 3;
+  }
+  optional NormalizationMode normalization = 3 [default = VALID];
+  // Deprecated.  Ignored if normalization is specified.  If normalization
+  // is not specified, then setting this to false will be equivalent to
+  // normalization = BATCH_SIZE, for consistency with previous behavior.
+  optional bool normalize = 2;
 }
 
 // Messages that store parameters used by individual layer types follow, in
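
As a usage note, here is a hedged sketch of how a caller might exercise the new field through the protobuf-generated C++ API (assuming the standard proto2 accessors generated from caffe.proto and the usual caffe.pb.h include path; the ignore label value is just an example):

#include <iostream>
#include "caffe/proto/caffe.pb.h"  // generated from caffe.proto

int main() {
  caffe::LossParameter loss_param;

  // New-style configuration: pick the normalization mode explicitly.
  loss_param.set_normalization(caffe::LossParameter_NormalizationMode_VALID);
  loss_param.set_ignore_label(255);  // 255 is only an illustrative ignore label

  // Old-style configuration: the deprecated flag still parses, and maps to
  // BATCH_SIZE (false) or VALID (true) only when `normalization` is unset.
  caffe::LossParameter legacy;
  legacy.set_normalize(false);

  std::cout << loss_param.has_normalization() << " "
            << legacy.has_normalize() << std::endl;  // prints: 1 1
}
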