Added GPU implementation of SoftmaxWithLossLayer.
authorSagan Bolliger <sagan@cs.toronto.edu>
Fri, 6 Feb 2015 20:54:32 +0000 (12:54 -0800)
committerSagan Bolliger <sagan@cs.toronto.edu>
Fri, 6 Feb 2015 20:54:32 +0000 (12:54 -0800)
include/caffe/loss_layers.hpp
src/caffe/layers/softmax_loss_layer.cpp
src/caffe/layers/softmax_loss_layer.cu [new file with mode: 0644]

index f14b971..09a5c24 100644 (file)
@@ -707,8 +707,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   /// @copydoc SoftmaxWithLossLayer
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // TODO(Yangqing): implement the GPU version of softmax.
-
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
   /**
    * @brief Computes the softmax loss error gradient w.r.t. the predictions.
    *
@@ -738,6 +738,9 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
    */
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
 
   /// The internal SoftmaxLayer used to map predictions to a distribution.
   shared_ptr<Layer<Dtype> > softmax_layer_;
index bf20b60..8f6a4ef 100644 (file)
@@ -115,6 +115,10 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifdef CPU_ONLY
+STUB_GPU(SoftmaxWithLossLayer);
+#endif
+
 INSTANTIATE_CLASS(SoftmaxWithLossLayer);
 REGISTER_LAYER_CLASS(SoftmaxWithLoss);
 
diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu
new file mode 100644 (file)
index 0000000..215d589
--- /dev/null
@@ -0,0 +1,129 @@
+#include <algorithm>
+#include <cfloat>
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void SoftmaxLossForwardGPU(const int nthreads,
+          const Dtype* prob_data, const Dtype* label, Dtype* loss,
+          const int num, const int dim, const int spatial_dim,
+          const bool has_ignore_label_, const int ignore_label_,
+          Dtype* counts) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      loss[index] = 0;
+      counts[index] = 0;
+    } else {
+      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+                      Dtype(FLT_MIN)));
+      counts[index] = 1;
+    }
+  }
+}
+
+template <typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.gpu_data();
+  const Dtype* label = bottom[1]->gpu_data();
+  const int num = prob_.num();
+  const int dim = prob_.count() / num;
+  const int spatial_dim = prob_.height() * prob_.width();
+  const int nthreads = num * spatial_dim;
+  // Since this memory is not used for anything until it is overwritten
+  // on the backward pass, we use it here to avoid having to allocate new GPU
+  // memory to accumulate intermediate results in the kernel.
+  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+  // Similarly, this memory is never used elsewhere, and thus we can use it
+  // to avoid having to allocate additional GPU memory.
+  Dtype* counts = prob_.mutable_gpu_diff();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SoftmaxLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
+      CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
+      num, dim, spatial_dim, has_ignore_label_, ignore_label_, counts);
+  Dtype loss;
+  caffe_gpu_asum(nthreads, loss_data, &loss);
+  if (normalize_) {
+    Dtype count;
+    caffe_gpu_asum(nthreads, counts, &count);
+    loss /= count;
+  } else {
+    loss /= num;
+  }
+  top[0]->mutable_cpu_data()[0] = loss;
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
+}
+
+template <typename Dtype>
+__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
+          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+          const int spatial_dim, const bool has_ignore_label_,
+          const int ignore_label_, Dtype* counts) {
+  const int channels = dim / spatial_dim;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      for (int c = 0; c < channels; ++c) {
+        bottom_diff[n * dim + c * spatial_dim + s] = 0;
+      }
+      counts[index] = 0;
+    } else {
+      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+      counts[index] = 1;
+    }
+  }
+}
+
+template <typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+               << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const Dtype* prob_data = prob_.gpu_data();
+    const Dtype* top_data = top[0]->gpu_data();
+    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
+    const Dtype* label = bottom[1]->gpu_data();
+    const int num = prob_.num();
+    const int dim = prob_.count() / num;
+    const int spatial_dim = prob_.height() * prob_.width();
+    const int nthreads = num * spatial_dim;
+    // Since this memory is never used for anything else,
+    // we use to to avoid allocating new GPU memory.
+    Dtype* counts = prob_.mutable_gpu_diff();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
+        CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
+        num, dim, spatial_dim, has_ignore_label_, ignore_label_, counts);
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    if (normalize_) {
+      Dtype count;
+      caffe_gpu_asum(nthreads, counts, &count);
+      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
+    } else {
+      caffe_gpu_scal(prob_.count(), loss_weight / num, bottom_diff);
+    }
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer);
+
+}  // namespace caffe