Merge pull request #3088 from lukeyeager/bvlc/lmdb-nolock
author Jon Long <jonlong@cs.berkeley.edu>
Wed, 21 Oct 2015 02:17:06 +0000 (19:17 -0700)
committer Jon Long <jonlong@cs.berkeley.edu>
Wed, 21 Oct 2015 02:17:06 +0000 (19:17 -0700)
Open LMDB files with MDB_NOLOCK if no write access
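For illustration, the idea behind the subject line is that LMDB's default locking needs write access to the lock file, so a read-only data source in an unwritable directory should be opened with MDB_NOLOCK. The following is a minimal sketch of that logic using the LMDB C API, assuming an access(2) check for writability; it is not the patched Caffe code (the actual data-layer change is not shown in the hunks below).

    #include <unistd.h>
    #include <lmdb.h>
    #include <glog/logging.h>

    // Sketch only: open an LMDB environment read-only, and skip the lock file
    // entirely when the process has no write access to the dataset directory.
    MDB_env* open_lmdb_readonly(const char* source) {
      MDB_env* env = NULL;
      CHECK_EQ(mdb_env_create(&env), MDB_SUCCESS);
      unsigned int flags = MDB_RDONLY | MDB_NOTLS;
      if (access(source, W_OK) != 0) {
        flags |= MDB_NOLOCK;  // no write access: cannot create/update the lock file
      }
      CHECK_EQ(mdb_env_open(env, source, flags, 0664), MDB_SUCCESS);
      return env;
    }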

72 files changed:
INSTALL.md
README.md
cmake/Dependencies.cmake
docs/installation.md
docs/tutorial/layers.md
docs/tutorial/solver.md
examples/00-classification.ipynb
examples/mnist/lenet_adadelta_solver.prototxt
examples/mnist/lenet_solver_adam.prototxt
examples/mnist/lenet_solver_rmsprop.prototxt
examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
examples/mnist/mnist_autoencoder_solver_adagrad.prototxt
examples/mnist/mnist_autoencoder_solver_nesterov.prototxt
examples/web_demo/requirements.txt
include/caffe/caffe.hpp
include/caffe/common_layers.hpp
include/caffe/sgd_solvers.hpp [new file with mode: 0644]
include/caffe/solver.hpp
include/caffe/solver_factory.hpp [new file with mode: 0644]
include/caffe/syncedmem.hpp
include/caffe/util/im2col.hpp
include/caffe/util/upgrade_proto.hpp
include/caffe/vision_layers.hpp
matlab/+caffe/+test/test_io.m [new file with mode: 0644]
matlab/+caffe/io.m
matlab/+caffe/private/caffe_.cpp
matlab/+caffe/run_tests.m
matlab/hdf5creation/store2hdf5.m
models/bvlc_reference_caffenet/train_val.prototxt
python/CMakeLists.txt
python/caffe/_caffe.cpp
python/caffe/draw.py
python/caffe/io.py
python/caffe/test/test_io.py [new file with mode: 0644]
src/caffe/layer_factory.cpp
src/caffe/layers/argmax_layer.cpp
src/caffe/layers/batch_reindex_layer.cpp [new file with mode: 0644]
src/caffe/layers/batch_reindex_layer.cu [new file with mode: 0644]
src/caffe/layers/cudnn_conv_layer.cpp
src/caffe/layers/cudnn_conv_layer.cu
src/caffe/layers/cudnn_lcn_layer.cpp [new file with mode: 0644]
src/caffe/layers/cudnn_lcn_layer.cu [new file with mode: 0644]
src/caffe/layers/cudnn_lrn_layer.cpp [new file with mode: 0644]
src/caffe/layers/cudnn_lrn_layer.cu [new file with mode: 0644]
src/caffe/layers/lrn_layer.cpp
src/caffe/layers/mvn_layer.cpp
src/caffe/layers/mvn_layer.cu
src/caffe/layers/prelu_layer.cu
src/caffe/layers/silence_layer.cpp
src/caffe/layers/silence_layer.cu
src/caffe/net.cpp
src/caffe/proto/caffe.proto
src/caffe/solver.cpp
src/caffe/solvers/adadelta_solver.cpp [new file with mode: 0644]
src/caffe/solvers/adagrad_solver.cpp [new file with mode: 0644]
src/caffe/solvers/adam_solver.cpp [new file with mode: 0644]
src/caffe/solvers/nesterov_solver.cpp [new file with mode: 0644]
src/caffe/solvers/rmsprop_solver.cpp [new file with mode: 0644]
src/caffe/solvers/sgd_solver.cpp [new file with mode: 0644]
src/caffe/syncedmem.cpp
src/caffe/test/test_argmax_layer.cpp
src/caffe/test/test_batch_reindex_layer.cpp [new file with mode: 0644]
src/caffe/test/test_gradient_based_solver.cpp
src/caffe/test/test_lrn_layer.cpp
src/caffe/test/test_solver.cpp
src/caffe/test/test_solver_factory.cpp [new file with mode: 0644]
src/caffe/test/test_upgrade_proto.cpp
src/caffe/util/im2col.cpp
src/caffe/util/im2col.cu
src/caffe/util/upgrade_proto.cpp
tools/caffe.cpp
tools/upgrade_solver_proto_text.cpp [new file with mode: 0644]

index 42fcf02..05c714d 100644 (file)
@@ -3,5 +3,5 @@
 See http://caffe.berkeleyvision.org/installation.html for the latest
 installation instructions.
 
-Check the issue tracker in case you need help:
-https://github.com/BVLC/caffe/issues
+Check the users group in case you need help:
+https://groups.google.com/forum/#!forum/caffe-users
index ebec286..44b9e62 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # Caffe
 
+[![Build Status](https://travis-ci.org/BVLC/caffe.svg?branch=master)](https://travis-ci.org/BVLC/caffe)
+[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
+
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
 It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors.
 
index a77ac6d..5651e2b 100644 (file)
@@ -58,9 +58,9 @@ endif()
 include(cmake/Cuda.cmake)
 if(NOT HAVE_CUDA)
   if(CPU_ONLY)
-    message("-- CUDA is disabled. Building without it...")
+    message(STATUS "-- CUDA is disabled. Building without it...")
   else()
-    message("-- CUDA is not detected by cmake. Building without it...")
+    message(WARNING "-- CUDA is not detected by cmake. Building without it...")
   endif()
 
   # TODO: remove this not cross platform define in future. Use caffe_config.h instead.
index 89a8c71..cce7ec3 100644 (file)
@@ -30,13 +30,14 @@ Optional dependencies:
 
 * [OpenCV](http://opencv.org/) >= 2.4 including 3.0
 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
+* cuDNN for GPU acceleration (v3)
 
 Pycaffe and Matcaffe interfaces have their own natural needs.
 
 * For Python Caffe:  `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python`
 * For MATLAB Caffe: MATLAB with the `mex` compiler.
 
-**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. For now cuDNN v1 is integrated but see [PR #1731](https://github.com/BVLC/caffe/pull/1731) for v2.
+**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v3; older versions are supported in older Caffe.
 
 **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.
 
index eabc792..7362aac 100644 (file)
@@ -39,7 +39,7 @@ In contrast, other layers (with few exceptions) ignore the spatial structure of
     - `n * c_i * h_i * w_i`
 * Output
     - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise.
-* Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`)
+* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
 
       layer {
         name: "conv1"
@@ -83,7 +83,7 @@ The `Convolution` layer convolves the input image with a set of learnable filter
     - `n * c * h_i * w_i`
 * Output
     - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution.
-* Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`)
+* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
 
       layer {
         name: "pool1"
@@ -197,7 +197,7 @@ In general, activation / Neuron layers are element-wise operators, taking one bo
 * Parameters (`ReLUParameter relu_param`)
     - Optional
         - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0.
-* Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`)
+* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
 
       layer {
         name: "relu1"
index b150f64..b719f71 100644 (file)
@@ -8,12 +8,12 @@ The responsibilities of learning are divided between the Solver for overseeing t
 
 The Caffe solvers are:
 
-- Stochastic Gradient Descent (`SGD`), 
-- AdaDelta (`ADADELTA`),
-- Adaptive Gradient (`ADAGRAD`),
-- Adam (`ADAM`),
-- Nesterov's Accelerated Gradient (`NESTEROV`) and
-- RMSprop (`RMSPROP`)
+- Stochastic Gradient Descent (`type: "SGD"`),
+- AdaDelta (`type: "AdaDelta"`),
+- Adaptive Gradient (`type: "AdaGrad"`),
+- Adam (`type: "Adam"`),
+- Nesterov's Accelerated Gradient (`type: "Nesterov"`) and
+- RMSprop (`type: "RMSProp"`)
 
 The solver
 
@@ -51,7 +51,7 @@ The parameter update $$\Delta W$$ is formed by the solver from the error gradien
 
 ### SGD
 
-**Stochastic gradient descent** (`solver_type: SGD`) updates the weights $$ W $$ by a linear combination of the negative gradient $$ \nabla L(W) $$ and the previous weight update $$ V_t $$.
+**Stochastic gradient descent** (`type: "SGD"`) updates the weights $$ W $$ by a linear combination of the negative gradient $$ \nabla L(W) $$ and the previous weight update $$ V_t $$.
 The **learning rate** $$ \alpha $$ is the weight of the negative gradient.
 The **momentum** $$ \mu $$ is the weight of the previous update.
 
@@ -113,7 +113,7 @@ If learning diverges (e.g., you start to see very large or `NaN` or `inf` loss v
 
 ### AdaDelta
 
-The **AdaDelta** (`solver_type: ADADELTA`) method (M. Zeiler [1]) is a "robust learning rate method". It is a gradient-based optimization method (like SGD). The update formulas are
+The **AdaDelta** (`type: "AdaDelta"`) method (M. Zeiler [1]) is a "robust learning rate method". It is a gradient-based optimization method (like SGD). The update formulas are
 
 $$
 \begin{align}
@@ -125,7 +125,7 @@ E[g^2]_t &= \delta{E[g^2]_{t-1} } + (1-\delta)g_{t}^2
 \end{align}
 $$
 
-and 
+and
 
 $$
 (W_{t+1})_i =
@@ -139,7 +139,7 @@ $$
 
 ### AdaGrad
 
-The **adaptive gradient** (`solver_type: ADAGRAD`) method (Duchi et al. [1]) is a gradient-based optimization method (like SGD) that attempts to "find needles in haystacks in the form of very predictive but rarely seen features," in Duchi et al.'s words.
+The **adaptive gradient** (`type: "AdaGrad"`) method (Duchi et al. [1]) is a gradient-based optimization method (like SGD) that attempts to "find needles in haystacks in the form of very predictive but rarely seen features," in Duchi et al.'s words.
 Given the update information from all previous iterations $$ \left( \nabla L(W) \right)_{t'} $$ for $$ t' \in \{1, 2, ..., t\} $$,
 the update formulas proposed by [1] are as follows, specified for each component $$i$$ of the weights $$W$$:
 
@@ -159,7 +159,7 @@ Note that in practice, for weights $$ W \in \mathcal{R}^d $$, AdaGrad implementa
 
 ### Adam
 
-The **Adam** (`solver_type: ADAM`), proposed in Kingma et al. [1], is a gradient-based optimization method (like SGD). This includes an "adaptive moment estimation" ($$m_t, v_t$$) and can be regarded as a generalization of AdaGrad. The update formulas are
+The **Adam** (`type: "Adam"`), proposed in Kingma et al. [1], is a gradient-based optimization method (like SGD). This includes an "adaptive moment estimation" ($$m_t, v_t$$) and can be regarded as a generalization of AdaGrad. The update formulas are
 
 $$
 (m_t)_i = \beta_1 (m_{t-1})_i + (1-\beta_1)(\nabla L(W_t))_i,\\
@@ -181,7 +181,7 @@ Kingma et al. [1] proposed to use $$\beta_1 = 0.9, \beta_2 = 0.999, \varepsilon
 
 ### NAG
 
-**Nesterov's accelerated gradient** (`solver_type: NESTEROV`) was proposed by Nesterov [1] as an "optimal" method of convex optimization, achieving a convergence rate of $$ \mathcal{O}(1/t^2) $$ rather than the $$ \mathcal{O}(1/t) $$.
+**Nesterov's accelerated gradient** (`type: "Nesterov"`) was proposed by Nesterov [1] as an "optimal" method of convex optimization, achieving a convergence rate of $$ \mathcal{O}(1/t^2) $$ rather than the $$ \mathcal{O}(1/t) $$.
 Though the required assumptions to achieve the $$ \mathcal{O}(1/t^2) $$ convergence typically will not hold for deep networks trained with Caffe (e.g., due to non-smoothness and non-convexity), in practice NAG can be a very effective method for optimizing certain types of deep learning architectures, as demonstrated for deep MNIST autoencoders by Sutskever et al. [2].
 
 The weight update formulas look very similar to the SGD updates given above:
@@ -206,10 +206,10 @@ What distinguishes the method from SGD is the weight setting $$ W $$ on which we
 
 ### RMSprop
 
-The **RMSprop** (`solver_type: RMSPROP`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). The update formulas are
+The **RMSprop** (`type: "RMSProp"`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). The update formulas are
 
 $$
-(v_t)_i = 
+(v_t)_i =
 \begin{cases}
 (v_{t-1})_i + \delta, &(\nabla L(W_t))_i(\nabla L(W_{t-1}))_i > 0\\
 (v_{t-1})_i \cdot (1-\delta), & \text{else}
index 46bbb19..89b7dd3 100644 (file)
    "source": [
     "net.blobs['data'].data[...] = transformer.preprocess('data', caffe.io.load_image(caffe_root + 'examples/images/cat.jpg'))\n",
     "out = net.forward()\n",
-    "print(\"Predicted class is #{}.\".format(out['prob'].argmax()))"
+    "print(\"Predicted class is #{}.\".format(out['prob'][0].argmax()))"
    ]
   },
   {
index 776d1e0..16176c0 100644 (file)
@@ -20,5 +20,5 @@ snapshot: 5000
 snapshot_prefix: "examples/mnist/lenet_adadelta"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type: ADADELTA
+type: "AdaDelta"
 delta: 1e-6
index d22c571..4b5336b 100644 (file)
@@ -22,5 +22,5 @@ max_iter: 10000
 snapshot: 5000
 snapshot_prefix: "examples/mnist/lenet"
 # solver mode: CPU or GPU
-solver_type: ADAM
+type: "Adam"
 solver_mode: GPU
index 74dadc5..924b72d 100644 (file)
@@ -23,5 +23,5 @@ snapshot: 5000
 snapshot_prefix: "examples/mnist/lenet_rmsprop"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type: RMSPROP
+type: "RMSProp"
 rms_decay: 0.98
index 065647d..26c4084 100644 (file)
@@ -16,4 +16,4 @@ snapshot: 10000
 snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type: ADADELTA
+type: "AdaDelta"
index cc0ed9e..065cdb2 100644 (file)
@@ -14,4 +14,4 @@ snapshot: 10000
 snapshot_prefix: "examples/mnist/mnist_autoencoder_adagrad_train"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type: ADAGRAD
+type: "AdaGrad"
index 2a59fd4..c95e3fe 100644 (file)
@@ -17,4 +17,4 @@ snapshot_prefix: "examples/mnist/mnist_autoencoder_nesterov_train"
 momentum: 0.95
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type: NESTEROV
+type: "Nesterov"
index 68a5e1d..a339efb 100644 (file)
 #include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
+#include "caffe/solver_factory.hpp"
 #include "caffe/util/benchmark.hpp"
 #include "caffe/util/io.hpp"
+#include "caffe/util/upgrade_proto.hpp"
 #include "caffe/vision_layers.hpp"
 
 #endif  // CAFFE_CAFFE_HPP_
index 89bab8d..21a27d7 100644 (file)
@@ -21,7 +21,8 @@ namespace caffe {
  *
  * Intended for use after a classification layer to produce a prediction.
  * If parameter out_max_val is set to true, output is a vector of pairs
- * (max_ind, max_val) for each image.
+ * (max_ind, max_val) for each image. The axis parameter specifies an axis
+ * along which to maximise.
  *
  * NOTE: does not implement Backwards operation.
  */
@@ -34,7 +35,11 @@ class ArgMaxLayer : public Layer<Dtype> {
    *   - top_k (\b optional uint, default 1).
    *     the number @f$ K @f$ of maximal items to output.
    *   - out_max_val (\b optional bool, default false).
-   *     if set, output a vector of pairs (max_ind, max_val) for each image.
+   *     if set, output a vector of pairs (max_ind, max_val); if axis is also
+   *     set, output only max_val along the specified axis.
+   *   - axis (\b optional int).
+   *     if set, maximise along the specified axis; otherwise maximise over the
+   *     flattened trailing dimensions for each index of the first / num dimension.
    */
   explicit ArgMaxLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
@@ -54,7 +59,8 @@ class ArgMaxLayer : public Layer<Dtype> {
    *      the inputs @f$ x @f$
    * @param top output Blob vector (length 1)
    *   -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val
-   *      @f$ (N \times 2 \times K \times 1) @f$
+   *      @f$ (N \times 2 \times K \times 1) @f$ unless axis is set, then e.g.
+   *      @f$ (N \times K \times H \times W) @f$ if axis == 1
    *      the computed outputs @f$
    *       y_n = \arg\max\limits_i x_{ni}
    *      @f$ (for @f$ K = 1 @f$).
@@ -68,9 +74,80 @@ class ArgMaxLayer : public Layer<Dtype> {
   }
   bool out_max_val_;
   size_t top_k_;
+  bool has_axis_;
+  int axis_;
 };
 
 /**
+ * @brief Index into the input blob along its first axis.
+ *
+ * This layer can be used to select, reorder, and even replicate examples in a
+ * batch.  The second blob is cast to int and treated as an index into the
+ * first axis of the first blob.
+ */
+template <typename Dtype>
+class BatchReindexLayer : public Layer<Dtype> {
+ public:
+  explicit BatchReindexLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchReindex"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2+)
+   *   -# @f$ (N \times ...) @f$
+   *      the inputs @f$ x_1 @f$
+   *   -# @f$ (M) @f$
+   *      the inputs @f$ x_2 @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (M \times ...) @f$:
+   *      the reindexed array @f$
+   *        y = x_1[x_2]
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the reordered input.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient
+   *        with respect to the outputs
+   *   -# @f$ (M \times ...) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to concatenated outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2):
+   *   - @f$ \frac{\partial E}{\partial y} @f$ is de-indexed (summing where
+   *     required) back to the input x_1
+   *   - This layer cannot backprop to x_2, i.e. propagate_down[1] must be
+   *     false.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  struct pair_sort_first {
+    bool operator()(const std::pair<int, int> &left,
+                    const std::pair<int, int> &right) {
+      return left.first < right.first;
+    }
+  };
+  void check_batch_reindex(int initial_num, int final_num,
+                           const Dtype* ridx_data);
+};
+
+
+/**
  * @brief Takes at least two Blob%s and concatenates them along either the num
  *        or channel dimension, outputting the result.
  */
diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp
new file mode 100644 (file)
index 0000000..1fc52d8
--- /dev/null
@@ -0,0 +1,148 @@
+#ifndef CAFFE_SGD_SOLVERS_HPP_
+#define CAFFE_SGD_SOLVERS_HPP_
+
+#include <string>
+#include <vector>
+
+#include "caffe/solver.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Optimizes the parameters of a Net using
+ *        stochastic gradient descent (SGD) with momentum.
+ */
+template <typename Dtype>
+class SGDSolver : public Solver<Dtype> {
+ public:
+  explicit SGDSolver(const SolverParameter& param)
+      : Solver<Dtype>(param) { PreSolve(); }
+  explicit SGDSolver(const string& param_file)
+      : Solver<Dtype>(param_file) { PreSolve(); }
+  virtual inline const char* type() const { return "SGD"; }
+
+  const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }
+
+ protected:
+  void PreSolve();
+  Dtype GetLearningRate();
+  virtual void ApplyUpdate();
+  virtual void Normalize(int param_id);
+  virtual void Regularize(int param_id);
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  virtual void ClipGradients();
+  virtual void SnapshotSolverState(const string& model_filename);
+  virtual void SnapshotSolverStateToBinaryProto(const string& model_filename);
+  virtual void SnapshotSolverStateToHDF5(const string& model_filename);
+  virtual void RestoreSolverStateFromHDF5(const string& state_file);
+  virtual void RestoreSolverStateFromBinaryProto(const string& state_file);
+  // history maintains the historical momentum data.
+  // update maintains update related data and is not needed in snapshots.
+  // temp maintains other information that might be needed in computation
+  //   of gradients/updates and is not needed in snapshots
+  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+
+  DISABLE_COPY_AND_ASSIGN(SGDSolver);
+};
+
+template <typename Dtype>
+class NesterovSolver : public SGDSolver<Dtype> {
+ public:
+  explicit NesterovSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) {}
+  explicit NesterovSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) {}
+  virtual inline const char* type() const { return "Nesterov"; }
+
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+  DISABLE_COPY_AND_ASSIGN(NesterovSolver);
+};
+
+template <typename Dtype>
+class AdaGradSolver : public SGDSolver<Dtype> {
+ public:
+  explicit AdaGradSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
+  explicit AdaGradSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
+  virtual inline const char* type() const { return "AdaGrad"; }
+
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  void constructor_sanity_check() {
+    CHECK_EQ(0, this->param_.momentum())
+        << "Momentum cannot be used with AdaGrad.";
+  }
+
+  DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
+};
+
+
+template <typename Dtype>
+class RMSPropSolver : public SGDSolver<Dtype> {
+ public:
+  explicit RMSPropSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
+  explicit RMSPropSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
+  virtual inline const char* type() const { return "RMSProp"; }
+
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  void constructor_sanity_check() {
+    CHECK_EQ(0, this->param_.momentum())
+        << "Momentum cannot be used with RMSProp.";
+    CHECK_GE(this->param_.rms_decay(), 0)
+        << "rms_decay should lie between 0 and 1.";
+    CHECK_LT(this->param_.rms_decay(), 1)
+        << "rms_decay should lie between 0 and 1.";
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
+};
+
+template <typename Dtype>
+class AdaDeltaSolver : public SGDSolver<Dtype> {
+ public:
+  explicit AdaDeltaSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
+  explicit AdaDeltaSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
+  virtual inline const char* type() const { return "AdaDelta"; }
+
+ protected:
+  void AdaDeltaPreSolve();
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+  DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
+};
+
+/**
+ * @brief AdamSolver, an algorithm for first-order gradient-based optimization
+ *        of stochastic objective functions, based on adaptive estimates of
+ *        lower-order moments. Described in [1].
+ *
+ * [1] D. P. Kingma and J. L. Ba, "ADAM: A Method for Stochastic Optimization."
+ *     arXiv preprint arXiv:1412.6980v8 (2014).
+ */
+template <typename Dtype>
+class AdamSolver : public SGDSolver<Dtype> {
+ public:
+  explicit AdamSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { AdamPreSolve();}
+  explicit AdamSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
+  virtual inline const char* type() const { return "Adam"; }
+
+ protected:
+  void AdamPreSolve();
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+  DISABLE_COPY_AND_ASSIGN(AdamSolver);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_SGD_SOLVERS_HPP_
index 2ecf539..298a68f 100644 (file)
@@ -1,10 +1,11 @@
-#ifndef CAFFE_OPTIMIZATION_SOLVER_HPP_
-#define CAFFE_OPTIMIZATION_SOLVER_HPP_
+#ifndef CAFFE_SOLVER_HPP_
+#define CAFFE_SOLVER_HPP_
 #include <boost/function.hpp>
 #include <string>
 #include <vector>
 
 #include "caffe/net.hpp"
+#include "caffe/solver_factory.hpp"
 
 namespace caffe {
 
@@ -83,6 +84,10 @@ class Solver {
   }
 
   void CheckSnapshotWritePermissions();
+  /**
+   * @brief Returns the solver type.
+   */
+  virtual inline const char* type() const { return ""; }
 
  protected:
   // Make and apply the update value for the current iteration.
@@ -148,158 +153,6 @@ class WorkerSolver : public Solver<Dtype> {
   }
 };
 
-/**
- * @brief Optimizes the parameters of a Net using
- *        stochastic gradient descent (SGD) with momentum.
- */
-template <typename Dtype>
-class SGDSolver : public Solver<Dtype> {
- public:
-  explicit SGDSolver(const SolverParameter& param)
-      : Solver<Dtype>(param) { PreSolve(); }
-  explicit SGDSolver(const string& param_file)
-      : Solver<Dtype>(param_file) { PreSolve(); }
-
-  const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }
-
- protected:
-  void PreSolve();
-  Dtype GetLearningRate();
-  virtual void ApplyUpdate();
-  virtual void Normalize(int param_id);
-  virtual void Regularize(int param_id);
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  virtual void ClipGradients();
-  virtual void SnapshotSolverState(const string& model_filename);
-  virtual void SnapshotSolverStateToBinaryProto(const string& model_filename);
-  virtual void SnapshotSolverStateToHDF5(const string& model_filename);
-  virtual void RestoreSolverStateFromHDF5(const string& state_file);
-  virtual void RestoreSolverStateFromBinaryProto(const string& state_file);
-  // history maintains the historical momentum data.
-  // update maintains update related data and is not needed in snapshots.
-  // temp maintains other information that might be needed in computation
-  //   of gradients/updates and is not needed in snapshots
-  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
-
-  DISABLE_COPY_AND_ASSIGN(SGDSolver);
-};
-
-template <typename Dtype>
-class NesterovSolver : public SGDSolver<Dtype> {
- public:
-  explicit NesterovSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) {}
-  explicit NesterovSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) {}
-
- protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-
-  DISABLE_COPY_AND_ASSIGN(NesterovSolver);
-};
-
-template <typename Dtype>
-class AdaGradSolver : public SGDSolver<Dtype> {
- public:
-  explicit AdaGradSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
-  explicit AdaGradSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
-
- protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  void constructor_sanity_check() {
-    CHECK_EQ(0, this->param_.momentum())
-        << "Momentum cannot be used with AdaGrad.";
-  }
-
-  DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
-};
-
-
-template <typename Dtype>
-class RMSPropSolver : public SGDSolver<Dtype> {
- public:
-  explicit RMSPropSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
-  explicit RMSPropSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
-
- protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  void constructor_sanity_check() {
-    CHECK_EQ(0, this->param_.momentum())
-        << "Momentum cannot be used with RMSProp.";
-    CHECK_GE(this->param_.rms_decay(), 0)
-        << "rms_decay should lie between 0 and 1.";
-    CHECK_LT(this->param_.rms_decay(), 1)
-        << "rms_decay should lie between 0 and 1.";
-  }
-
-  DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
-};
-
-template <typename Dtype>
-class AdaDeltaSolver : public SGDSolver<Dtype> {
- public:
-  explicit AdaDeltaSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
-  explicit AdaDeltaSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
-
- protected:
-  void AdaDeltaPreSolve();
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-
-  DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
-};
-
-/**
- * @brief AdamSolver, an algorithm for first-order gradient-based optimization
- *        of stochastic objective functions, based on adaptive estimates of
- *        lower-order moments. Described in [1].
- *
- * [1] D. P. Kingma and J. L. Ba, "ADAM: A Method for Stochastic Optimization."
- *     arXiv preprint arXiv:1412.6980v8 (2014).
- */
-template <typename Dtype>
-class AdamSolver : public SGDSolver<Dtype> {
- public:
-  explicit AdamSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { AdamPreSolve();}
-  explicit AdamSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
-
- protected:
-  void AdamPreSolve();
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-
-  DISABLE_COPY_AND_ASSIGN(AdamSolver);
-};
-
-template <typename Dtype>
-Solver<Dtype>* GetSolver(const SolverParameter& param) {
-  SolverParameter_SolverType type = param.solver_type();
-
-  switch (type) {
-  case SolverParameter_SolverType_SGD:
-    return new SGDSolver<Dtype>(param);
-  case SolverParameter_SolverType_NESTEROV:
-    return new NesterovSolver<Dtype>(param);
-  case SolverParameter_SolverType_ADAGRAD:
-    return new AdaGradSolver<Dtype>(param);
-  case SolverParameter_SolverType_RMSPROP:
-    return new RMSPropSolver<Dtype>(param);
-  case SolverParameter_SolverType_ADADELTA:
-    return new AdaDeltaSolver<Dtype>(param);
-  case SolverParameter_SolverType_ADAM:
-    return new AdamSolver<Dtype>(param);
-  default:
-    LOG(FATAL) << "Unknown SolverType: " << type;
-  }
-  return (Solver<Dtype>*) NULL;
-}
-
 }  // namespace caffe
 
-#endif  // CAFFE_OPTIMIZATION_SOLVER_HPP_
+#endif  // CAFFE_SOLVER_HPP_
diff --git a/include/caffe/solver_factory.hpp b/include/caffe/solver_factory.hpp
new file mode 100644 (file)
index 0000000..cfff721
--- /dev/null
@@ -0,0 +1,137 @@
+/**
+ * @brief A solver factory that allows one to register solvers, similar to
+ * layer factory. During runtime, registered solvers could be called by passing
+ * a SolverParameter protobuffer to the CreateSolver function:
+ *
+ *     SolverRegistry<Dtype>::CreateSolver(param);
+ *
+ * There are two ways to register a solver. Assuming that we have a solver like:
+ *
+ *   template <typename Dtype>
+ *   class MyAwesomeSolver : public Solver<Dtype> {
+ *     // your implementations
+ *   };
+ *
+ * and its type is its C++ class name, but without the "Solver" at the end
+ * ("MyAwesomeSolver" -> "MyAwesome").
+ *
+ * If the solver is going to be created simply by its constructor, in your c++
+ * file, add the following line:
+ *
+ *    REGISTER_SOLVER_CLASS(MyAwesome);
+ *
+ * Or, if the solver is going to be created by another creator function, in the
+ * format of:
+ *
+ *    template <typename Dtype>
+ *    Solver<Dtype>* GetMyAwesomeSolver(const SolverParameter& param) {
+ *      // your implementation
+ *    }
+ *
+ * then you can register the creator function instead, like
+ *
+ * REGISTER_SOLVER_CREATOR(MyAwesome, GetMyAwesomeSolver)
+ *
+ * Note that each solver type should only be registered once.
+ */
+
+#ifndef CAFFE_SOLVER_FACTORY_H_
+#define CAFFE_SOLVER_FACTORY_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype>
+class Solver;
+
+template <typename Dtype>
+class SolverRegistry {
+ public:
+  typedef Solver<Dtype>* (*Creator)(const SolverParameter&);
+  typedef std::map<string, Creator> CreatorRegistry;
+
+  static CreatorRegistry& Registry() {
+    static CreatorRegistry* g_registry_ = new CreatorRegistry();
+    return *g_registry_;
+  }
+
+  // Adds a creator.
+  static void AddCreator(const string& type, Creator creator) {
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 0)
+        << "Solver type " << type << " already registered.";
+    registry[type] = creator;
+  }
+
+  // Get a solver using a SolverParameter.
+  static Solver<Dtype>* CreateSolver(const SolverParameter& param) {
+    const string& type = param.type();
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 1) << "Unknown solver type: " << type
+        << " (known types: " << SolverTypeListString() << ")";
+    return registry[type](param);
+  }
+
+  static vector<string> SolverTypeList() {
+    CreatorRegistry& registry = Registry();
+    vector<string> solver_types;
+    for (typename CreatorRegistry::iterator iter = registry.begin();
+         iter != registry.end(); ++iter) {
+      solver_types.push_back(iter->first);
+    }
+    return solver_types;
+  }
+
+ private:
+  // Solver registry should never be instantiated - everything is done with its
+  // static variables.
+  SolverRegistry() {}
+
+  static string SolverTypeListString() {
+    vector<string> solver_types = SolverTypeList();
+    string solver_types_str;
+    for (vector<string>::iterator iter = solver_types.begin();
+         iter != solver_types.end(); ++iter) {
+      if (iter != solver_types.begin()) {
+        solver_types_str += ", ";
+      }
+      solver_types_str += *iter;
+    }
+    return solver_types_str;
+  }
+};
+
+
+template <typename Dtype>
+class SolverRegisterer {
+ public:
+  SolverRegisterer(const string& type,
+      Solver<Dtype>* (*creator)(const SolverParameter&)) {
+    // LOG(INFO) << "Registering solver type: " << type;
+    SolverRegistry<Dtype>::AddCreator(type, creator);
+  }
+};
+
+
+#define REGISTER_SOLVER_CREATOR(type, creator)                                 \
+  static SolverRegisterer<float> g_creator_f_##type(#type, creator<float>);    \
+  static SolverRegisterer<double> g_creator_d_##type(#type, creator<double>)   \
+
+#define REGISTER_SOLVER_CLASS(type)                                            \
+  template <typename Dtype>                                                    \
+  Solver<Dtype>* Creator_##type##Solver(                                       \
+      const SolverParameter& param)                                            \
+  {                                                                            \
+    return new type##Solver<Dtype>(param);                                     \
+  }                                                                            \
+  REGISTER_SOLVER_CREATOR(type, Creator_##type##Solver)
+
+}  // namespace caffe
+
+#endif  // CAFFE_SOLVER_FACTORY_H_
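As a concrete example of the registration described in the header comment above, a new solver that simply reuses the SGD update could be registered in its own .cpp file as sketched below. The class name and type string are hypothetical and exist only for illustration; the built-in solvers in src/caffe/solvers/ are registered the same way.

    #include "caffe/sgd_solvers.hpp"
    #include "caffe/solver_factory.hpp"

    namespace caffe {

    // Hypothetical solver used only to illustrate REGISTER_SOLVER_CLASS; it is
    // not part of this PR and inherits the SGD update unchanged.
    template <typename Dtype>
    class MyAwesomeSolver : public SGDSolver<Dtype> {
     public:
      explicit MyAwesomeSolver(const SolverParameter& param)
          : SGDSolver<Dtype>(param) {}
      virtual inline const char* type() const { return "MyAwesome"; }
    };

    INSTANTIATE_CLASS(MyAwesomeSolver);
    REGISTER_SOLVER_CLASS(MyAwesome);

    }  // namespace caffe

A SolverParameter with type: "MyAwesome" would then be resolved by SolverRegistry<Dtype>::CreateSolver just like the built-in types.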
index 62aadef..3d92a0e 100644 (file)
@@ -13,20 +13,22 @@ namespace caffe {
 // The improvement in performance seems negligible in the single GPU case,
 // but might be more significant for parallel training. Most importantly,
 // it improved stability for large models on many GPUs.
-inline void CaffeMallocHost(void** ptr, size_t size) {
+inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
 #ifndef CPU_ONLY
   if (Caffe::mode() == Caffe::GPU) {
     CUDA_CHECK(cudaMallocHost(ptr, size));
+    *use_cuda = true;
     return;
   }
 #endif
   *ptr = malloc(size);
+  *use_cuda = false;
   CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
-inline void CaffeFreeHost(void* ptr) {
+inline void CaffeFreeHost(void* ptr, bool use_cuda) {
 #ifndef CPU_ONLY
-  if (Caffe::mode() == Caffe::GPU) {
+  if (use_cuda) {
     CUDA_CHECK(cudaFreeHost(ptr));
     return;
   }
@@ -45,10 +47,12 @@ class SyncedMemory {
  public:
   SyncedMemory()
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
+        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
+        gpu_device_(-1) {}
   explicit SyncedMemory(size_t size)
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
+        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
+        gpu_device_(-1) {}
   ~SyncedMemory();
   const void* cpu_data();
   void set_cpu_data(void* data);
@@ -72,6 +76,7 @@ class SyncedMemory {
   size_t size_;
   SyncedHead head_;
   bool own_cpu_data_;
+  bool cpu_malloc_use_cuda_;
   bool own_gpu_data_;
   int gpu_device_;
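The new out-parameter makes the free path depend on how a buffer was actually allocated rather than on the current Caffe::mode(), so switching modes between allocation and release can no longer pair cudaMallocHost with plain free(). A minimal usage sketch of the new signatures (assumed caller code, not from the patch):

    #include "caffe/syncedmem.hpp"

    void scratch_buffer_example() {
      void* buf = NULL;
      bool use_cuda = false;
      // Records whether pinned (cudaMallocHost) or pageable (malloc) memory
      // was used, so the matching deallocator is chosen below.
      caffe::CaffeMallocHost(&buf, 4096, &use_cuda);
      // ... fill and consume buf ...
      caffe::CaffeFreeHost(buf, use_cuda);
    }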
 
index 531fd29..d3eb6cc 100644 (file)
@@ -23,7 +23,7 @@ void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
@@ -47,7 +47,7 @@ void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
index c1f21a0..c94bb3c 100644 (file)
@@ -10,6 +10,15 @@ namespace caffe {
 // Return true iff the net is not the current version.
 bool NetNeedsUpgrade(const NetParameter& net_param);
 
+// Check for deprecations and upgrade the NetParameter as needed.
+bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
+
+// Read parameters from a file into a NetParameter proto message.
+void ReadNetParamsFromTextFileOrDie(const string& param_file,
+                                    NetParameter* param);
+void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
+                                      NetParameter* param);
+
 // Return true iff any layer contains parameters specified using
 // deprecated V0LayerParameter.
 bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param);
@@ -50,14 +59,17 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
-// Check for deprecations and upgrade the NetParameter as needed.
-bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
+// Return true iff the solver contains any old solver_type specified as enums
+bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param);
 
-// Read parameters from a file into a NetParameter proto message.
-void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param);
-void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param);
+bool UpgradeSolverType(SolverParameter* solver_param);
+
+// Check for deprecations and upgrade the SolverParameter as needed.
+bool UpgradeSolverAsNeeded(const string& param_file, SolverParameter* param);
+
+// Read parameters from a file into a SolverParameter proto message.
+void ReadSolverParamsFromTextFileOrDie(const string& param_file,
+                                       SolverParameter* param);
 
 }  // namespace caffe
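The new solver upgrade helpers mirror the existing net upgrade path. A hedged sketch of migrating an old-style parameter in memory follows; it assumes the deprecated solver_type enum field is retained in caffe.proto, which these helpers imply but the proto diff is not shown here.

    #include <glog/logging.h>
    #include "caffe/proto/caffe.pb.h"
    #include "caffe/util/upgrade_proto.hpp"

    // Sketch only: rewrite the deprecated solver_type enum into the new
    // string-valued type field.
    void upgrade_old_solver_param() {
      caffe::SolverParameter param;
      param.set_solver_type(caffe::SolverParameter_SolverType_ADAGRAD);
      if (caffe::SolverNeedsTypeUpgrade(param)) {
        caffe::UpgradeSolverType(&param);  // becomes type: "AdaGrad"
      }
      CHECK(param.type() == "AdaGrad");
    }

ReadSolverParamsFromTextFileOrDie presumably performs this upgrade automatically when loading from disk, which is why the tools and language bindings below switch to it.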
 
index 06bc045..237b05d 100644 (file)
@@ -304,13 +304,24 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   bool handles_setup_;
   cudnnHandle_t* handle_;
   cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
   vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
   cudnnTensorDescriptor_t    bias_desc_;
   cudnnFilterDescriptor_t      filter_desc_;
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, bias_offset_;
-  size_t workspaceSizeInBytes;
-  void *workspace;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
 };
 #endif
 
@@ -442,6 +453,65 @@ class LRNLayer : public Layer<Dtype> {
   vector<Blob<Dtype>*> product_bottom_vec_;
 };
 
+#ifdef USE_CUDNN
+
+template <typename Dtype>
+class CuDNNLRNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLRNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLRNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_;
+  Dtype alpha_, beta_, k_;
+};
+
+template <typename Dtype>
+class CuDNNLCNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLCNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
+        tempData1(NULL), tempData2(NULL) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLCNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_, pre_pad_;
+  Dtype alpha_, beta_, k_;
+
+  size_t tempDataSize;
+  void *tempData1, *tempData2;
+};
+
+#endif
 
 /**
  * @brief Pools the input image by taking the max, average, etc. within regions.
diff --git a/matlab/+caffe/+test/test_io.m b/matlab/+caffe/+test/test_io.m
new file mode 100644 (file)
index 0000000..2c34bd1
--- /dev/null
@@ -0,0 +1,18 @@
+classdef test_io < matlab.unittest.TestCase
+  methods (Test)
+    function test_read_write_mean(self)
+      % randomly generate mean data
+      width = 200;
+      height = 300;
+      channels = 3;
+      mean_data_write = 255 * rand(width, height, channels, 'single');
+      % write mean data to binary proto
+      mean_proto_file = tempname();
+      caffe.io.write_mean(mean_data_write, mean_proto_file);
+      % read mean data from saved binary proto and test whether they are equal
+      mean_data_read = caffe.io.read_mean(mean_proto_file);
+      self.verifyEqual(mean_data_write, mean_data_read)
+      delete(mean_proto_file);
+    end
+  end
+end
index af8369d..4b072fe 100644 (file)
@@ -29,5 +29,13 @@ classdef io
       CHECK_FILE_EXIST(mean_proto_file);
       mean_data = caffe_('read_mean', mean_proto_file);
     end
+    function write_mean(mean_data, mean_proto_file)
+      % write_mean(mean_data, mean_proto_file)
+      %   write image mean data to binaryproto file
+      %   mean_data should be W x H x C with BGR channels
+      CHECK(ischar(mean_proto_file), 'mean_proto_file must be a string');
+      CHECK(isa(mean_data, 'single'), 'mean_data must be a SINGLE matrix');
+      caffe_('write_mean', mean_data, mean_proto_file);
+    end   
   end
 end
index 4e0ebc1..1641e14 100644 (file)
@@ -188,7 +188,10 @@ static void get_solver(MEX_ARGS) {
       "Usage: caffe_('get_solver', solver_file)");
   char* solver_file = mxArrayToString(prhs[0]);
   mxCHECK_FILE_EXIST(solver_file);
-  shared_ptr<Solver<float> > solver(new caffe::SGDSolver<float>(solver_file));
+  SolverParameter solver_param;
+  ReadSolverParamsFromTextFileOrDie(solver_file, &solver_param);
+  shared_ptr<Solver<float> > solver(
+      SolverRegistry<float>::CreateSolver(solver_param));
   solvers_.push_back(solver);
   plhs[0] = ptr_to_handle<Solver<float> >(solver.get());
   mxFree(solver_file);
@@ -478,6 +481,29 @@ static void read_mean(MEX_ARGS) {
   mxFree(mean_proto_file);
 }
 
+// Usage: caffe_('write_mean', mean_data, mean_proto_file)
+static void write_mean(MEX_ARGS) {
+  mxCHECK(nrhs == 2 && mxIsSingle(prhs[0]) && mxIsChar(prhs[1]),
+      "Usage: caffe_('write_mean', mean_data, mean_proto_file)");
+  char* mean_proto_file = mxArrayToString(prhs[1]);
+  int ndims = mxGetNumberOfDimensions(prhs[0]);
+  mxCHECK(ndims >= 2 && ndims <= 3, "mean_data must have 2 or 3 dimensions");
+  const mwSize *dims = mxGetDimensions(prhs[0]);
+  int width = dims[0];
+  int height = dims[1];
+  int channels;
+  if (ndims == 3)
+    channels = dims[2];
+  else
+    channels = 1;
+  Blob<float> data_mean(1, channels, height, width);
+  mx_mat_to_blob(prhs[0], &data_mean, DATA);
+  BlobProto blob_proto;
+  data_mean.ToProto(&blob_proto, false);
+  WriteProtoToBinaryFile(blob_proto, mean_proto_file);
+  mxFree(mean_proto_file);
+}
+
 /** -----------------------------------------------------------------
  ** Available commands.
  **/
@@ -515,6 +541,7 @@ static handler_registry handlers[] = {
   { "get_init_key",       get_init_key    },
   { "reset",              reset           },
   { "read_mean",          read_mean       },
+  { "write_mean",         write_mean      },
   // The end.
   { "END",                NULL            },
 };
index 9389685..6dbf6b2 100644 (file)
@@ -11,7 +11,8 @@ caffe.reset_all();
 % put all test cases here
 results = [...
   run(caffe.test.test_net) ...
-  run(caffe.test.test_solver) ];
+  run(caffe.test.test_solver) ...
+  run(caffe.test.test_io) ];
 
 % reset caffe after testing
 caffe.reset_all();
index 0a0016d..4e8c81d 100644 (file)
@@ -39,8 +39,8 @@ function [curr_dat_sz, curr_lab_sz] = store2hdf5(filename, data, labels, create,
       info=h5info(filename);
       prev_dat_sz=info.Datasets(1).Dataspace.Size;
       prev_lab_sz=info.Datasets(2).Dataspace.Size;
-      assert(prev_dat_sz(1:end-1)==dat_dims(1:end-1), 'Data dimensions must match existing dimensions in dataset');
-      assert(prev_lab_sz(1:end-1)==lab_dims(1:end-1), 'Label dimensions must match existing dimensions in dataset');
+      assert(all(prev_dat_sz(1:end-1)==dat_dims(1:end-1)), 'Data dimensions must match existing dimensions in dataset');
+      assert(all(prev_lab_sz(1:end-1)==lab_dims(1:end-1)), 'Label dimensions must match existing dimensions in dataset');
       startloc.dat=[ones(1,length(dat_dims)-1), prev_dat_sz(end)+1];
       startloc.lab=[ones(1,length(lab_dims)-1), prev_lab_sz(end)+1];
     end
index c79472e..e3e4279 100644 (file)
@@ -45,7 +45,7 @@ layer {
 #    mean_value: 104
 #    mean_value: 117
 #    mean_value: 123
-#    mirror: true
+#    mirror: false
 #  }
   data_param {
     source: "examples/imagenet/ilsvrc12_val_lmdb"
index 0e2bc7e..a226414 100644 (file)
@@ -1,5 +1,5 @@
 if(NOT HAVE_PYTHON)
-  message(STATUS "Python interface is disabled or not all required dependecies found. Building without it...")
+  message(STATUS "Python interface is disabled or not all required dependencies found. Building without it...")
   return()
 endif()
 
index ccd5776..8687dd8 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "caffe/caffe.hpp"
 #include "caffe/python_layer.hpp"
+#include "caffe/sgd_solvers.hpp"
 
 // Temporary solution for numpy < 1.7 versions: old macro, no promises.
 // You're strongly advised to upgrade to >= 1.7.
@@ -133,8 +134,8 @@ void Net_SetInputArrays(Net<Dtype>* net, bp::object data_obj,
 
 Solver<Dtype>* GetSolverFromFile(const string& filename) {
   SolverParameter param;
-  ReadProtoFromTextFileOrDie(filename, &param);
-  return GetSolver<Dtype>(param);
+  ReadSolverParamsFromTextFileOrDie(filename, &param);
+  return SolverRegistry<Dtype>::CreateSolver(param);
 }
 
 struct NdarrayConverterGenerator {
index a002b60..f8bf572 100644 (file)
@@ -82,11 +82,11 @@ def get_layer_label(layer, rankdir):
                       separator,
                       layer.type,
                       separator,
-                      layer.convolution_param.kernel_size,
+                      layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1,
                       separator,
-                      layer.convolution_param.stride,
+                      layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1,
                       separator,
-                      layer.convolution_param.pad)
+                      layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0)
     elif layer.type == 'Pooling':
         pooling_types_dict = get_pooling_types_dict()
         node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\
index 0cad721..11c8426 100644 (file)
@@ -20,23 +20,26 @@ def blobproto_to_array(blob, return_diff=False):
     Convert a blob proto to an array. In default, we will just return the data,
     unless return_diff is True, in which case we will return the diff.
     """
+    # Read the data into an array
     if return_diff:
-        return np.array(blob.diff).reshape(
-            blob.num, blob.channels, blob.height, blob.width)
+        data = np.array(blob.diff)
     else:
-        return np.array(blob.data).reshape(
-            blob.num, blob.channels, blob.height, blob.width)
+        data = np.array(blob.data)
 
+    # Reshape the array
+    if blob.HasField('num') or blob.HasField('channels') or blob.HasField('height') or blob.HasField('width'):
+        # Use legacy 4D shape
+        return data.reshape(blob.num, blob.channels, blob.height, blob.width)
+    else:
+        return data.reshape(blob.shape.dim)
 
 def array_to_blobproto(arr, diff=None):
-    """Converts a 4-dimensional array to blob proto. If diff is given, also
+    """Converts a N-dimensional array to blob proto. If diff is given, also
     convert the diff. You need to make sure that arr and diff have the same
     shape, and this function does not do sanity check.
     """
-    if arr.ndim != 4:
-        raise ValueError('Incorrect array shape.')
     blob = caffe_pb2.BlobProto()
-    blob.num, blob.channels, blob.height, blob.width = arr.shape
+    blob.shape.dim.extend(arr.shape)
     blob.data.extend(arr.astype(float).flat)
     if diff is not None:
         blob.diff.extend(diff.astype(float).flat)
diff --git a/python/caffe/test/test_io.py b/python/caffe/test/test_io.py
new file mode 100644 (file)
index 0000000..8c86ef7
--- /dev/null
@@ -0,0 +1,41 @@
+import numpy as np
+import unittest
+
+import caffe
+
+class TestBlobProtoToArray(unittest.TestCase):
+
+    def test_old_format(self):
+        data = np.zeros((10,10))
+        blob = caffe.proto.caffe_pb2.BlobProto()
+        blob.data.extend(list(data.flatten()))
+        shape = (1,1,10,10)
+        blob.num, blob.channels, blob.height, blob.width = shape
+
+        arr = caffe.io.blobproto_to_array(blob)
+        self.assertEqual(arr.shape, shape)
+
+    def test_new_format(self):
+        data = np.zeros((10,10))
+        blob = caffe.proto.caffe_pb2.BlobProto()
+        blob.data.extend(list(data.flatten()))
+        blob.shape.dim.extend(list(data.shape))
+
+        arr = caffe.io.blobproto_to_array(blob)
+        self.assertEqual(arr.shape, data.shape)
+
+    def test_no_shape(self):
+        data = np.zeros((10,10))
+        blob = caffe.proto.caffe_pb2.BlobProto()
+        blob.data.extend(list(data.flatten()))
+
+        with self.assertRaises(ValueError):
+            caffe.io.blobproto_to_array(blob)
+
+    def test_scalar(self):
+        data = np.ones((1)) * 123
+        blob = caffe.proto.caffe_pb2.BlobProto()
+        blob.data.extend(list(data.flatten()))
+
+        arr = caffe.io.blobproto_to_array(blob)
+        self.assertEqual(arr, 123)
index 926c7d8..417ffe9 100644 (file)
@@ -54,10 +54,8 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
     return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
 #ifdef USE_CUDNN
   } else if (engine == PoolingParameter_Engine_CUDNN) {
-    PoolingParameter p_param = param.pooling_param();
-    if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
-        param.top_size() > 1) {
-      LOG(INFO) << "CUDNN does not support padding or multiple tops. "
+    if (param.top_size() > 1) {
+      LOG(INFO) << "cuDNN does not support multiple tops. "
                 << "Using Caffe's own pooling layer.";
       return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
     }
@@ -70,6 +68,43 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 
+// Get LRN layer according to engine
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetLRNLayer(const LayerParameter& param) {
+  LRNParameter_Engine engine = param.lrn_param().engine();
+
+  if (engine == LRNParameter_Engine_DEFAULT) {
+#ifdef USE_CUDNN
+    engine = LRNParameter_Engine_CUDNN;
+#else
+    engine = LRNParameter_Engine_CAFFE;
+#endif
+  }
+
+  if (engine == LRNParameter_Engine_CAFFE) {
+    return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
+#ifdef USE_CUDNN
+  } else if (engine == LRNParameter_Engine_CUDNN) {
+    LRNParameter lrn_param = param.lrn_param();
+
+    if (lrn_param.norm_region() ==LRNParameter_NormRegion_WITHIN_CHANNEL) {
+      return shared_ptr<Layer<Dtype> >(new CuDNNLCNLayer<Dtype>(param));
+    } else {
+      // local size is too big to be handled through cuDNN
+      if (param.lrn_param().local_size() > CUDNN_LRN_MAX_N) {
+        return shared_ptr<Layer<Dtype> >(new LRNLayer<Dtype>(param));
+      } else {
+        return shared_ptr<Layer<Dtype> >(new CuDNNLRNLayer<Dtype>(param));
+      }
+    }
+#endif
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
+}
+
+REGISTER_LAYER_CREATOR(LRN, GetLRNLayer);
+
 // Get relu layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
index c4040cd..0c0a932 100644 (file)
@@ -11,23 +11,43 @@ namespace caffe {
 template <typename Dtype>
 void ArgMaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  out_max_val_ = this->layer_param_.argmax_param().out_max_val();
-  top_k_ = this->layer_param_.argmax_param().top_k();
-  CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
-  CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
-      << "top_k must be less than or equal to the number of classes.";
+  const ArgMaxParameter& argmax_param = this->layer_param_.argmax_param();
+  out_max_val_ = argmax_param.out_max_val();
+  top_k_ = argmax_param.top_k();
+  has_axis_ = argmax_param.has_axis();
+  CHECK_GE(top_k_, 1) << "top k must not be less than 1.";
+  if (has_axis_) {
+    axis_ = bottom[0]->CanonicalAxisIndex(argmax_param.axis());
+    CHECK_GE(axis_, 0) << "axis must not be less than 0.";
+    CHECK_LE(axis_, bottom[0]->num_axes()) <<
+      "axis must be less than or equal to the number of axis.";
+    CHECK_LE(top_k_, bottom[0]->shape(axis_))
+      << "top_k must be less than or equal to the dimension of the axis.";
+  } else {
+    CHECK_LE(top_k_, bottom[0]->count(1))
+      << "top_k must be less than or equal to"
+        " the dimension of the flattened bottom blob per instance.";
+  }
 }
 
 template <typename Dtype>
 void ArgMaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  if (out_max_val_) {
-    // Produces max_ind and max_val
-    top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
+  std::vector<int> shape(bottom[0]->num_axes(), 1);
+  if (has_axis_) {
+    // Produces max_ind or max_val per axis
+    shape = bottom[0]->shape();
+    shape[axis_] = top_k_;
   } else {
-    // Produces only max_ind
-    top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1);
+    shape[0] = bottom[0]->shape(0);
+    // Produces max_ind
+    shape[2] = top_k_;
+    if (out_max_val_) {
+      // Produces max_ind and max_val
+      shape[1] = 2;
+    }
   }
+  top[0]->Reshape(shape);
 }
 
 template <typename Dtype>
@@ -35,23 +55,40 @@ void ArgMaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
+  int dim, axis_dist;
+  if (has_axis_) {
+    dim = bottom[0]->shape(axis_);
+    // Distance between values of axis in blob
+    axis_dist = bottom[0]->count(axis_) / dim;
+  } else {
+    dim = bottom[0]->count(1);
+    axis_dist = 1;
+  }
+  int num = bottom[0]->count() / dim;
+  std::vector<std::pair<Dtype, int> > bottom_data_vector(dim);
   for (int i = 0; i < num; ++i) {
-    std::vector<std::pair<Dtype, int> > bottom_data_vector;
     for (int j = 0; j < dim; ++j) {
-      bottom_data_vector.push_back(
-          std::make_pair(bottom_data[i * dim + j], j));
+      bottom_data_vector[j] = std::make_pair(
+        bottom_data[(i / axis_dist * dim + j) * axis_dist + i % axis_dist], j);
     }
     std::partial_sort(
         bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
         bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
     for (int j = 0; j < top_k_; ++j) {
-      top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
-    }
-    if (out_max_val_) {
-      for (int j = 0; j < top_k_; ++j) {
-        top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first;
+      if (out_max_val_) {
+        if (has_axis_) {
+          // Produces max_val per axis
+          top_data[(i / axis_dist * top_k_ + j) * axis_dist + i % axis_dist]
+            = bottom_data_vector[j].first;
+        } else {
+          // Produces max_ind and max_val
+          top_data[2 * i * top_k_ + j] = bottom_data_vector[j].second;
+          top_data[2 * i * top_k_ + top_k_ + j] = bottom_data_vector[j].first;
+        }
+      } else {
+        // Produces max_ind per axis
+        top_data[(i / axis_dist * top_k_ + j) * axis_dist + i % axis_dist]
+          = bottom_data_vector[j].second;
       }
     }
   }
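The axis-aware indexing used in Forward_cpu above interleaves the outer index, the reduced axis, and the inner (post-axis) offset. A small self-contained sketch of the same arithmetic, with hypothetical shapes, makes the mapping easy to check by hand.

    // Sketch: for a blob of shape (N, C, H, W) and axis = 1 (channels),
    // dim = C and axis_dist = H * W. Element j of the reduced axis for the
    // i-th (n, h, w) slice lives at
    //   (i / axis_dist * dim + j) * axis_dist + i % axis_dist
    #include <cassert>

    int bottom_index(int i, int j, int dim, int axis_dist) {
      return (i / axis_dist * dim + j) * axis_dist + i % axis_dist;
    }

    int main() {
      const int N = 2, C = 3, H = 4, W = 5;
      const int dim = C, axis_dist = H * W;  // axis = 1
      // i = 0 is (n=0, h=0, w=0); channel j sits at flat offset j * H * W.
      assert(bottom_index(0, 2, dim, axis_dist) == 2 * H * W);
      // i = axis_dist is (n=1, h=0, w=0); channel 1 sits at C*H*W + H*W.
      assert(bottom_index(axis_dist, 1, dim, axis_dist) == C * H * W + H * W);
      return 0;
    }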
diff --git a/src/caffe/layers/batch_reindex_layer.cpp b/src/caffe/layers/batch_reindex_layer.cpp
new file mode 100644 (file)
index 0000000..3bf757c
--- /dev/null
@@ -0,0 +1,79 @@
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+                                       const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(1, bottom[1]->num_axes());
+  vector<int> newshape;
+  newshape.push_back(bottom[1]->shape(0));
+  for (int i = 1; i < bottom[0]->shape().size(); ++i) {
+    newshape.push_back(bottom[0]->shape()[i]);
+  }
+  top[0]->Reshape(newshape);
+}
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::check_batch_reindex(int initial_num,
+                                                   int final_num,
+                                                   const Dtype* ridx_data) {
+  for (int i = 0; i < final_num; ++i) {
+    CHECK_GE(ridx_data[i], 0)
+        << "Index specified for reindex layer was negative.";
+    CHECK_LT(ridx_data[i], initial_num)
+        << "Index specified for reindex layer was greater than batch size.";
+  }
+}
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+                                           const vector<Blob<Dtype>*>& top) {
+  check_batch_reindex(bottom[0]->shape(0), bottom[1]->count(),
+                      bottom[1]->cpu_data());
+  if (top[0]->count() == 0) {
+    return;
+  }
+  int inner_dim = bottom[0]->count() / bottom[0]->shape(0);
+  const Dtype* in = bottom[0]->cpu_data();
+  const Dtype* permut = bottom[1]->cpu_data();
+  Dtype* out = top[0]->mutable_cpu_data();
+  for (int index = 0; index < top[0]->count(); ++index) {
+    int n = index / (inner_dim);
+    int in_n = static_cast<int>(permut[n]);
+    out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
+  }
+}
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::Backward_cpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[1]) << "Cannot backprop to index.";
+  if (!propagate_down[0]) {
+    return;
+  }
+  int inner_dim = bottom[0]->count() / bottom[0]->shape(0);
+  Dtype* bot_diff = bottom[0]->mutable_cpu_diff();
+  const Dtype* permut = bottom[1]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  caffe_set(bottom[0]->count(), Dtype(0), bot_diff);
+  for (int index = 0; index < top[0]->count(); ++index) {
+    int n = index / (inner_dim);
+    int in_n = static_cast<int>(permut[n]);
+    bot_diff[in_n * (inner_dim) + index % (inner_dim)] += top_diff[index];
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(BatchReindexLayer);
+#endif
+
+INSTANTIATE_CLASS(BatchReindexLayer);
+REGISTER_LAYER_CLASS(BatchReindex);
+
+}  // namespace caffe
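The forward pass of the new layer is a plain gather over rows of the bottom blob. A toy, self-contained version of the same loop (hypothetical data, CPU only) shows the intended semantics, including the fact that indices may repeat.

    // Sketch of the gather performed by Forward_cpu, with made-up values.
    #include <cstdio>
    #include <vector>

    int main() {
      const int inner_dim = 2;
      std::vector<float> in = {10, 11,  20, 21,  30, 31};  // 3 input rows
      std::vector<float> permut = {2, 0, 0, 1};             // 4 output rows
      std::vector<float> out(permut.size() * inner_dim);
      for (size_t index = 0; index < out.size(); ++index) {
        int n = index / inner_dim;
        int in_n = static_cast<int>(permut[n]);
        out[index] = in[in_n * inner_dim + index % inner_dim];
      }
      // out = {30, 31, 10, 11, 10, 11, 20, 21}
      for (size_t i = 0; i < out.size(); ++i) std::printf("%.0f ", out[i]);
      std::printf("\n");
      return 0;
    }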
diff --git a/src/caffe/layers/batch_reindex_layer.cu b/src/caffe/layers/batch_reindex_layer.cu
new file mode 100644 (file)
index 0000000..c418cab
--- /dev/null
@@ -0,0 +1,107 @@
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template<typename Dtype>
+__global__ void BRForward(const int count, const int inner_dim, const Dtype* in,
+                          const Dtype* permut, Dtype* out) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int n = index / (inner_dim);
+    int in_n = static_cast<int>(permut[n]);
+    out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
+  }
+}
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                                           const vector<Blob<Dtype>*>& top) {
+  check_batch_reindex(bottom[0]->shape(0), bottom[1]->count(),
+                      bottom[1]->cpu_data());
+  if (top[0]->count() == 0) {
+    return;
+  }
+  int threads = top[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  BRForward<Dtype> <<<CAFFE_GET_BLOCKS(threads), CAFFE_CUDA_NUM_THREADS>>>(
+      top[0]->count(), bottom[0]->count() / bottom[0]->shape(0),
+      bottom[0]->gpu_data(), bottom[1]->gpu_data(), top[0]->mutable_gpu_data());
+  CUDA_POST_KERNEL_CHECK;
+}
+
+template<typename Dtype>
+__global__ void BRBackward(const int count, const int inner_dim,
+                           const Dtype* in, const Dtype* top_indexes,
+                           const Dtype* begins, const Dtype* counts,
+                           Dtype* out) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int n = index / (inner_dim);
+    out[index] = 0;
+    int lower = static_cast<int>(begins[n]);
+    int upper = lower + static_cast<int>(counts[n]);
+    for (int i = lower; i < upper; ++i) {
+      int in_n = static_cast<int>(top_indexes[i]);
+      out[index] += in[in_n * (inner_dim) + index % (inner_dim)];
+    }
+  }
+}
+
+template<typename Dtype>
+void BatchReindexLayer<Dtype>::Backward_gpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[1]) << "Cannot backprop to index.";
+  if (!propagate_down[0]) {
+    return;
+  }
+
+  vector<std::pair<int, int> > mapping;
+  const Dtype* perm = bottom[1]->cpu_data();
+  for (int i = 0; i < bottom[1]->count(); ++i) {
+    mapping.push_back(pair<int, int>(static_cast<int>(perm[i]), i));
+  }
+  std::sort(mapping.begin(), mapping.end(), pair_sort_first());
+
+  // Each element of the bottom diff is potentially the sum of many top diffs.
+  // However, we'd like each CUDA thread to handle exactly one output.  Hence,
+  // we first pre-compute a list of lists of indices that need to be summed for
+  // each output. `top_indexes` holds the data of this list of lists.  The
+  // k'th element of `begins` points to the location in `top_indexes` where the
+  // list for the k'th example begin, and the k'th element of `counts` is the
+  // length of that list.
+  vector<int> shape;
+  shape.push_back(bottom[1]->count());
+  Blob<Dtype> top_indexes(shape);
+  shape[0] = bottom[0]->shape(0);
+  Blob<Dtype> counts(shape);
+  Blob<Dtype> begins(shape);
+  Dtype* t_i_data = top_indexes.mutable_cpu_data();
+  Dtype* c_data = counts.mutable_cpu_data();
+  Dtype* b_data = begins.mutable_cpu_data();
+  caffe_set(begins.count(), Dtype(-1), b_data);
+  caffe_set(counts.count(), Dtype(0), c_data);
+  for (int i = 0; i < mapping.size(); ++i) {
+    t_i_data[i] = mapping[i].second;
+    if (b_data[mapping[i].first] == -1) {
+      b_data[mapping[i].first] = i;
+    }
+    c_data[mapping[i].first] += 1;
+  }
+
+  int threads = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  BRBackward<Dtype> <<<CAFFE_GET_BLOCKS(threads), CAFFE_CUDA_NUM_THREADS>>>(
+      bottom[0]->count(), bottom[0]->count() / bottom[0]->shape(0),
+      top[0]->gpu_diff(), top_indexes.gpu_data(), begins.gpu_data(),
+      counts.gpu_data(), bottom[0]->mutable_gpu_diff());
+  CUDA_POST_KERNEL_CHECK;
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(BatchReindexLayer);
+
+}  // namespace caffe
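The begins/counts pre-computation described in the comment above is easiest to see on a concrete permutation: sorting (source row, top row) pairs by source row groups together every top row that pulled from the same input, so each bottom row knows exactly which top diffs to sum. The sketch below uses the same hypothetical permutation as the forward example.

    // Sketch of the host-side bookkeeping that feeds BRBackward.
    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      const int initial_num = 3;
      std::vector<int> permut = {2, 0, 0, 1};
      std::vector<std::pair<int, int> > mapping;
      for (size_t i = 0; i < permut.size(); ++i) {
        mapping.push_back(std::make_pair(permut[i], static_cast<int>(i)));
      }
      std::sort(mapping.begin(), mapping.end());
      std::vector<int> top_indexes(permut.size());
      std::vector<int> begins(initial_num, -1), counts(initial_num, 0);
      for (size_t i = 0; i < mapping.size(); ++i) {
        top_indexes[i] = mapping[i].second;  // which top row to read
        if (begins[mapping[i].first] == -1) begins[mapping[i].first] = i;
        counts[mapping[i].first] += 1;
      }
      // top_indexes = {1, 2, 3, 0}, begins = {0, 2, 3}, counts = {2, 1, 1}:
      // bottom row 0 sums top rows 1 and 2, row 1 sums row 3, row 2 sums row 0.
      for (size_t k = 0; k < begins.size(); ++k) std::printf("%d ", begins[k]);
      std::printf("\n");
      return 0;
    }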
index 3514fe2..d7b1e0d 100644 (file)
@@ -1,4 +1,5 @@
 #ifdef USE_CUDNN
+#include <algorithm>
 #include <vector>
 
 #include "caffe/filler.hpp"
@@ -24,13 +25,38 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   // Initialize CUDA streams and cuDNN.
   stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
   handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+
+  // Initialize algorithm arrays
+  fwd_algo_       = new cudnnConvolutionFwdAlgo_t[bottom.size()];
+  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
+  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
+
+  // initialize size arrays
+  workspace_fwd_sizes_ = new size_t[bottom.size()];
+  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
+  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
+
+  // workspace data
   workspaceSizeInBytes = 0;
-  workspace = NULL;
+  workspaceData = NULL;
+  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
+
+  for (size_t i = 0; i < bottom.size(); ++i) {
+    // initialize all to default algorithms
+    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
+    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
+    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
+    // default algorithms don't require workspace
+    workspace_fwd_sizes_[i] = 0;
+    workspace_bwd_data_sizes_[i] = 0;
+    workspace_bwd_filter_sizes_[i] = 0;
+  }
 
   for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
     CUDA_CHECK(cudaStreamCreate(&stream_[g]));
     CUDNN_CHECK(cudnnCreate(&handle_[g]));
     CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
+    workspace[g] = NULL;
   }
 
   // Set the indexing parameters.
@@ -86,6 +112,10 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   const int stride_h = stride_data[0];
   const int stride_w = stride_data[1];
 
+  // Specify workspace limit for kernels directly until we have a
+  // planning strategy and a rewrite of Caffe's GPU memory management
+  size_t workspace_limit_bytes = 8*1024*1024;
+
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
         this->num_,
@@ -98,7 +128,104 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         this->num_output_ * this->out_spatial_dim_,
         this->out_spatial_dim_, width_out, 1);
     cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
-        filter_desc_, pad_h, pad_w, stride_h, stride_w);
+        filter_desc_, pad_h, pad_w,
+        stride_h, stride_w);
+
+    // choose forward and backward algorithms + workspace(s)
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
+      bottom_descs_[i],
+      filter_desc_,
+      conv_descs_[i],
+      top_descs_[i],
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes,
+      &fwd_algo_[i]));
+
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
+      bottom_descs_[i],
+      filter_desc_,
+      conv_descs_[i],
+      top_descs_[i],
+      fwd_algo_[i],
+      &(workspace_fwd_sizes_[i])));
+
+    // choose backward algorithm for filter
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
+          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+          workspace_limit_bytes, &bwd_filter_algo_[i]) );
+
+    // get workspace for backwards filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
+          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+          bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+
+    // choose backward algo for data
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
+          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
+          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes, &bwd_data_algo_[i]));
+
+    // get workspace size
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
+          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
+          bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
+  }
+
+  // reduce over all workspace sizes to get a maximum to allocate / reallocate
+  size_t total_workspace_fwd = 0;
+  size_t total_workspace_bwd_data = 0;
+  size_t total_workspace_bwd_filter = 0;
+
+  for (size_t i = 0; i < bottom.size(); i++) {
+    total_workspace_fwd        = std::max(total_workspace_fwd,
+                                     workspace_fwd_sizes_[i]);
+    total_workspace_bwd_data   = std::max(total_workspace_bwd_data,
+                                     workspace_bwd_data_sizes_[i]);
+    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
+                                     workspace_bwd_filter_sizes_[i]);
+  }
+  // get max over all operations
+  size_t max_workspace = std::max(total_workspace_fwd,
+                             total_workspace_bwd_data);
+  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
+  // ensure all groups have enough workspace
+  size_t total_max_workspace = max_workspace *
+                               (this->group_ * CUDNN_STREAMS_PER_GROUP);
+
+  // this is the total amount of storage needed over all groups + streams
+  if (total_max_workspace > workspaceSizeInBytes) {
+    LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
+    workspaceSizeInBytes = total_max_workspace;
+
+    // free the existing workspace and allocate a new (larger) one
+    cudaFree(this->workspaceData);
+
+    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
+    if (err != cudaSuccess) {
+      // force zero memory path
+      for (int i = 0; i < bottom.size(); i++) {
+        workspace_fwd_sizes_[i] = 0;
+        workspace_bwd_filter_sizes_[i] = 0;
+        workspace_bwd_data_sizes_[i] = 0;
+        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
+
+      // NULL out all workspace pointers
+      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
+        workspace[g] = NULL;
+      }
+      // NULL out underlying data
+      workspaceData = NULL;
+      workspaceSizeInBytes = 0;
+    }
+
+    // if we succeed in the allocation, set pointer aliases for workspaces
+    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
+      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
+    }
   }
 
   // Tensor descriptor for bias.
@@ -128,8 +255,15 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
     cudnnDestroy(handle_[g]);
   }
 
+  cudaFree(workspaceData);
   delete [] stream_;
   delete [] handle_;
+  delete [] fwd_algo_;
+  delete [] bwd_filter_algo_;
+  delete [] bwd_data_algo_;
+  delete [] workspace_fwd_sizes_;
+  delete [] workspace_bwd_data_sizes_;
+  delete [] workspace_bwd_filter_sizes_;
 }
 
 INSTANTIATE_CLASS(CuDNNConvolutionLayer);
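The Reshape changes above size a single workspace slab for the most demanding algorithm and then hand each group/stream pair its own slice of it. The arithmetic, with hypothetical per-blob sizes standing in for the values the cudnnGet*WorkspaceSize calls would return, is simply a running maximum followed by one multiplication.

    // Sketch with made-up sizes; the real numbers come from cuDNN queries.
    #include <algorithm>
    #include <cstdio>

    int main() {
      const size_t fwd[] = {4u << 20, 6u << 20};
      const size_t bwd_data[] = {3u << 20, 5u << 20};
      const size_t bwd_filter[] = {2u << 20, 7u << 20};
      const int groups_times_streams = 2 * 3;  // group_ * CUDNN_STREAMS_PER_GROUP

      size_t max_workspace = 0;
      for (int i = 0; i < 2; ++i) {
        max_workspace = std::max(max_workspace, fwd[i]);
        max_workspace = std::max(max_workspace, bwd_data[i]);
        max_workspace = std::max(max_workspace, bwd_filter[i]);
      }
      // One allocation covers every group and stream; slice g starts at
      // g * max_workspace inside the single workspaceData block.
      size_t total_max_workspace = max_workspace * groups_times_streams;
      std::printf("%zu bytes total (%zu per slice)\n",
                  total_max_workspace, max_workspace);
      return 0;
    }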
index 6911520..e88e4dd 100644 (file)
@@ -14,11 +14,6 @@ __global__ void sync_conv_groups() { }
 template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
-  const int kernel_h = kernel_shape_data[0];
-  const int kernel_w = kernel_shape_data[1];
-  const size_t workspace_limit_bytes =
-      kernel_h * kernel_w * this->channels_ * sizeof(int) + 1;
   const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
@@ -26,52 +21,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
-      cudnnConvolutionFwdAlgo_t algo;
-
-      // pick the convolution algorithm
-      // TODO(shelhamer) this should be done during reshape
-      // TODO(shelhamer) the choice of automatic or manual algorithm picking
-      // should be exposed in proto
-      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,  // memoryLimitInBytes,
-        &algo));
-
-      // get minimum size of the workspace needed for the desired algorithm
-      size_t workspaceSizeInBytes_temp = 0;
-
-      CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        algo,
-        &workspaceSizeInBytes_temp));
-
-      if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
-        workspaceSizeInBytes = workspaceSizeInBytes_temp;
-        // free the existing workspace and allocate a new (larger) one
-        cudaFree(this->workspace);
-        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
-        if (err != cudaSuccess) {
-          // force zero memory path
-          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-          workspace = NULL;
-          workspaceSizeInBytes = 0;
-        }
-      }
-
       // Filters.
       CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
             cudnn::dataType<Dtype>::one,
             bottom_descs_[i], bottom_data + bottom_offset_ * g,
             filter_desc_, weight + this->weight_offset_ * g,
             conv_descs_[i],
-            algo, workspace, workspaceSizeInBytes,
+            fwd_algo_[i], workspace[g], workspace_fwd_sizes_[i],
             cudnn::dataType<Dtype>::zero,
             top_descs_[i], top_data + top_offset_ * g));
 
@@ -101,10 +57,12 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     weight = this->blobs_[0]->gpu_data();
     weight_diff = this->blobs_[0]->mutable_gpu_diff();
+    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
   }
   Dtype* bias_diff = NULL;
   if (this->bias_term_ && this->param_propagate_down_[1]) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
@@ -122,11 +80,14 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       // Gradient w.r.t. weights.
       if (this->param_propagate_down_[0]) {
         const Dtype* bottom_data = bottom[i]->gpu_data();
-        CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g],
+        CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3(
+              handle_[1*this->group_ + g],
               cudnn::dataType<Dtype>::one,
               bottom_descs_[i], bottom_data + bottom_offset_ * g,
               top_descs_[i],    top_diff + top_offset_ * g,
               conv_descs_[i],
+              bwd_filter_algo_[i], workspace[1*this->group_ + g],
+              workspace_bwd_filter_sizes_[i],
               cudnn::dataType<Dtype>::one,
               filter_desc_, weight_diff + this->weight_offset_ * g));
       }
@@ -137,11 +98,14 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           weight = this->blobs_[0]->gpu_data();
         }
         Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-        CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
+        CUDNN_CHECK(cudnnConvolutionBackwardData_v3(
+              handle_[2*this->group_ + g],
               cudnn::dataType<Dtype>::one,
               filter_desc_, weight + this->weight_offset_ * g,
               top_descs_[i], top_diff + top_offset_ * g,
               conv_descs_[i],
+              bwd_data_algo_[i], workspace[2*this->group_ + g],
+              workspace_bwd_data_sizes_[i],
               cudnn::dataType<Dtype>::zero,
               bottom_descs_[i], bottom_diff + bottom_offset_ * g));
       }
diff --git a/src/caffe/layers/cudnn_lcn_layer.cpp b/src/caffe/layers/cudnn_lcn_layer.cpp
new file mode 100644 (file)
index 0000000..866d810
--- /dev/null
@@ -0,0 +1,77 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNLCNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LRNLayer<Dtype>::LayerSetUp(bottom, top);
+
+  CUDNN_CHECK(cudnnCreate(&handle_));
+  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
+  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
+  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
+
+  // create a LRN handle
+  handles_setup_ = true;
+
+  size_ = this->layer_param().lrn_param().local_size();
+  pre_pad_ = (size_ - 1) / 2;
+  alpha_ = this->layer_param().lrn_param().alpha();
+  beta_ = this->layer_param().lrn_param().beta();
+  k_ = this->layer_param().lrn_param().k();
+}
+
+template <typename Dtype>
+void CuDNNLCNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LRNLayer<Dtype>::Reshape(bottom, top);
+  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
+      this->channels_, this->height_, this->width_);
+  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
+      this->channels_, this->height_, this->width_);
+  CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
+
+  // allocate / reallocate tempData buffers
+  size_t totalSizeInBytes = sizeof(Dtype)*bottom[0]->num()* \
+                            this->channels_*this->height_*this->width_;
+
+  if (totalSizeInBytes > tempDataSize) {
+    tempDataSize = totalSizeInBytes;
+
+    cudaFree(tempData1);
+    cudaFree(tempData2);
+
+    // allocate new buffers
+    CUDA_CHECK(cudaMalloc(&tempData1, totalSizeInBytes));
+    CUDA_CHECK(cudaMalloc(&tempData2, totalSizeInBytes));
+  }
+}
+
+template <typename Dtype>
+CuDNNLCNLayer<Dtype>::~CuDNNLCNLayer() {
+  // Check that handles have been setup before destroying.
+  if (!handles_setup_) { return; }
+
+  cudnnDestroyTensorDescriptor(bottom_desc_);
+  cudnnDestroyTensorDescriptor(top_desc_);
+
+  // destroy LRN handle
+  cudnnDestroy(handle_);
+
+  // free temp buffers
+  cudaFree(tempData1);
+  cudaFree(tempData2);
+}
+
+INSTANTIATE_CLASS(CuDNNLCNLayer);
+
+}   // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_lcn_layer.cu b/src/caffe/layers/cudnn_lcn_layer.cu
new file mode 100644 (file)
index 0000000..c07ade7
--- /dev/null
@@ -0,0 +1,50 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNLCNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+
+  CUDNN_CHECK(cudnnDivisiveNormalizationForward(
+        handle_, norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+        cudnn::dataType<Dtype>::one,
+        bottom_desc_, bottom_data,
+        NULL,  // srcMeansData
+        this->tempData1, this->tempData2,
+        cudnn::dataType<Dtype>::zero,
+        top_desc_, top_data) );
+}
+
+template <typename Dtype>
+void CuDNNLCNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+
+  CUDNN_CHECK(cudnnDivisiveNormalizationBackward(
+        handle_, norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+        cudnn::dataType<Dtype>::one,
+        bottom_desc_, bottom_data,
+        NULL, top_diff,  // NULL - srcMeansData
+        this->tempData1, this->tempData2,
+        cudnn::dataType<Dtype>::zero,
+        bottom_desc_, bottom_diff,
+        NULL) );
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLCNLayer);
+
+}  // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_lrn_layer.cpp b/src/caffe/layers/cudnn_lrn_layer.cpp
new file mode 100644 (file)
index 0000000..6e99214
--- /dev/null
@@ -0,0 +1,57 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LRNLayer<Dtype>::LayerSetUp(bottom, top);
+
+  CUDNN_CHECK(cudnnCreate(&handle_));
+  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
+  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
+  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
+
+  // create a LRN handle
+  handles_setup_ = true;
+
+  size_ = this->layer_param().lrn_param().local_size();
+  alpha_ = this->layer_param().lrn_param().alpha();
+  beta_ = this->layer_param().lrn_param().beta();
+  k_ = this->layer_param().lrn_param().k();
+}
+
+template <typename Dtype>
+void CuDNNLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LRNLayer<Dtype>::Reshape(bottom, top);
+  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
+      this->channels_, this->height_, this->width_);
+  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
+      this->channels_, this->height_, this->width_);
+  CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
+}
+
+template <typename Dtype>
+CuDNNLRNLayer<Dtype>::~CuDNNLRNLayer() {
+  // Check that handles have been setup before destroying.
+  if (!handles_setup_) { return; }
+
+  cudnnDestroyTensorDescriptor(bottom_desc_);
+  cudnnDestroyTensorDescriptor(top_desc_);
+
+  // destroy LRN handle
+  cudnnDestroy(handle_);
+}
+
+INSTANTIATE_CLASS(CuDNNLRNLayer);
+
+}   // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_lrn_layer.cu b/src/caffe/layers/cudnn_lrn_layer.cu
new file mode 100644 (file)
index 0000000..f992303
--- /dev/null
@@ -0,0 +1,48 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void CuDNNLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+
+  CUDNN_CHECK(cudnnLRNCrossChannelForward(
+        handle_, norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1,
+        cudnn::dataType<Dtype>::one,
+        bottom_desc_, bottom_data,
+        cudnn::dataType<Dtype>::zero,
+        top_desc_, top_data) );
+}
+
+template <typename Dtype>
+void CuDNNLRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+
+  CUDNN_CHECK(cudnnLRNCrossChannelBackward(
+        handle_, norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1,
+        cudnn::dataType<Dtype>::one,
+        top_desc_, top_data,
+        top_desc_, top_diff,
+        bottom_desc_, bottom_data,
+        cudnn::dataType<Dtype>::zero,
+        bottom_desc_, bottom_diff) );
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLRNLayer);
+
+}  // namespace caffe
+
+#endif
index 36c1ace..d18a04e 100644 (file)
@@ -254,6 +254,5 @@ STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward);
 #endif
 
 INSTANTIATE_CLASS(LRNLayer);
-REGISTER_LAYER_CLASS(LRN);
 
 }  // namespace caffe
index 325691b..61c2141 100644 (file)
@@ -42,29 +42,21 @@ void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   int dim = bottom[0]->count() / num;
 
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_cpu_data());
+  // subtract mean
+  caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
+      sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+      mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+      temp_.mutable_cpu_data());
+  caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);  // X-EX
 
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    // compute variance using var(X) = E((X-EX)^2)
+    caffe_powx(bottom[0]->count(), top_data, Dtype(2),
+        temp_.mutable_cpu_data());  // (X-EX)^2
     caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
         sum_multiplier_.cpu_data(), 0.,
-        variance_.mutable_cpu_data());  // E(X^2)
-    caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
-        temp_.mutable_cpu_data());  // (EX)^2
-    caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
-        variance_.mutable_cpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-            temp_.mutable_cpu_data());
-
-    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
+        variance_.mutable_cpu_data());  // E((X-EX)^2)
 
     // normalize variance
     caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
@@ -77,16 +69,6 @@ void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
           temp_.mutable_cpu_data());
 
     caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
-  } else {
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
-
-    // subtract mean
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-            temp_.mutable_cpu_data());
-
-    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
   }
 }
 
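Both MVN hunks now compute the variance from the already-centered activations, i.e. var(X) = E[(X-EX)^2] rather than E[X^2] - (EX)^2. The two are equal algebraically; as a side note (not stated as the motivation in this commit), the centered form also avoids the cancellation the old identity can suffer in single precision when the mean is large. A tiny sketch with made-up numbers:

    // Sketch (hypothetical data): compare the two variance formulas in float.
    #include <cstdio>

    int main() {
      const float x[] = {10000.1f, 10000.2f, 10000.3f};
      const int n = 3;
      float mean = 0.f, ex2 = 0.f;
      for (int i = 0; i < n; ++i) { mean += x[i] / n; ex2 += x[i] * x[i] / n; }
      float var_cancel = ex2 - mean * mean;  // E[X^2] - (EX)^2
      float var_center = 0.f;                // E[(X - EX)^2]
      for (int i = 0; i < n; ++i) {
        var_center += (x[i] - mean) * (x[i] - mean) / n;
      }
      std::printf("cancellation-prone: %g  centered: %g\n",
                  var_cancel, var_center);
      return 0;
    }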
index d86a2e7..5cbb112 100644 (file)
@@ -20,29 +20,22 @@ void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   int dim = bottom[0]->count() / num;
 
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
+  // subtract mean
+  caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
+      sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+      mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+      temp_.mutable_gpu_data());
+  caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(),
+      top_data);  // X-EX
 
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    // compute variance using var(X) = E((X-EX)^2)
+    caffe_gpu_powx(bottom[0]->count(), top_data, Dtype(2),
+        temp_.mutable_gpu_data());  // (X-EX)^2
     caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
         sum_multiplier_.gpu_data(), 0.,
-        variance_.mutable_gpu_data());  // E(X^2)
-    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-        temp_.mutable_gpu_data());  // (EX)^2
-    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-        variance_.mutable_gpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+        variance_.mutable_gpu_data());  // E((X-EX)^2)
 
     // normalize variance
     caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
@@ -55,16 +48,6 @@ void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
           temp_.mutable_gpu_data());
 
     caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
-  } else {
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
   }
 }
 
index e1f2004..1225334 100644 (file)
@@ -31,10 +31,15 @@ __global__ void PReLUBackward(const int n, const int channels, const int dim,
 
 // CUDA kernel for element-wise parameter backward
 template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
+__global__ void PReLUParamBackward(const int n,
+    const int rows, const int rowPitch, const Dtype* in_diff,
     const Dtype* in_data, Dtype* out_diff) {
   CUDA_KERNEL_LOOP(index, n) {
     out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+    for (int k = 1; k < rows; k++) {
+      out_diff[index] += in_diff[index + k*rowPitch]
+          * in_data[index + k*rowPitch] * (in_data[index + k*rowPitch] <= 0);
+    }
   }
 }
 
@@ -82,29 +87,24 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
     int cdim = channels * dim;
-    Dtype dsum = 0.;
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
-          CAFFE_CUDA_NUM_THREADS>>>(
-          cdim, top_diff + top[0]->offset(n),
-          bottom_data + bottom[0]->offset(n),
-          backward_buff_.mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-      if (channel_shared_) {
-        Dtype d;
-        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        dsum += d;
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
-            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-            slope_diff);
-      }
-    }
+
+    // compute element-wise diff
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
+      CAFFE_CUDA_NUM_THREADS>>>(
+      cdim, bottom[0]->num(), top[0]->offset(1), top_diff,
+      bottom_data,
+      backward_buff_.mutable_gpu_diff());
+    CUDA_POST_KERNEL_CHECK;
     if (channel_shared_) {
+      Dtype dsum;
+      caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
+       multiplier_.gpu_data(), &dsum);
       caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+    } else {
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
+        backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+        slope_diff);
     }
   }
   // Propagate to bottom
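The PReLU change above removes the per-image loop on the host: each kernel thread now owns one (channel, spatial) slot and accumulates its slope gradient across the whole batch, with rowPitch = top[0]->offset(1) as the per-image stride. A CPU analogue of that accumulation, a sketch only with a hypothetical free function, reads:

    // CPU analogue of PReLUParamBackward after this change: one output slot
    // per (channel, spatial position), summed over the batch dimension.
    void prelu_param_backward_cpu(int cdim, int rows, int rowPitch,
                                  const float* in_diff, const float* in_data,
                                  float* out_diff) {
      for (int index = 0; index < cdim; ++index) {
        out_diff[index] = 0.f;
        for (int k = 0; k < rows; ++k) {  // walk the batch for this slot
          const int off = index + k * rowPitch;
          out_diff[index] += in_diff[off] * in_data[off] * (in_data[off] <= 0);
        }
      }
    }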
index 4abf9ef..7e70ab4 100644 (file)
@@ -12,7 +12,7 @@ void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   for (int i = 0; i < bottom.size(); ++i) {
     if (propagate_down[i]) {
       caffe_set(bottom[i]->count(), Dtype(0),
-                bottom[i]->mutable_cpu_data());
+                bottom[i]->mutable_cpu_diff());
     }
   }
 }
index 8d044ee..34faef2 100644 (file)
@@ -18,7 +18,7 @@ void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   for (int i = 0; i < bottom.size(); ++i) {
     if (propagate_down[i]) {
       caffe_gpu_set(bottom[i]->count(), Dtype(0),
-                    bottom[i]->mutable_gpu_data());
+                    bottom[i]->mutable_gpu_diff());
     }
   }
 }
index ebb8b5d..1ad93e6 100644 (file)
@@ -46,10 +46,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // the current NetState.
   NetParameter filtered_param;
   FilterNet(in_param, &filtered_param);
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Initializing net from parameters: " << std::endl
-              << filtered_param.DebugString();
-  }
+  LOG_IF(INFO, Caffe::root_solver())
+      << "Initializing net from parameters: " << std::endl
+      << filtered_param.DebugString();
   // Create a copy of filtered_param with splits added where necessary.
   NetParameter param;
   InsertSplits(filtered_param, &param);
@@ -73,8 +72,6 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     const int layer_id = -1;  // inputs have fake layer ID -1
     AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
   }
-  DLOG_IF(INFO, Caffe::root_solver())
-      << "Memory required for data: " << memory_used_ * sizeof(Dtype);
   // For each layer, set up its input and output
   bottom_vecs_.resize(param.layer_size());
   top_vecs_.resize(param.layer_size());
@@ -106,9 +103,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
       layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
     }
     layer_names_.push_back(layer_param.name());
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Creating Layer " << layer_param.name();
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Creating Layer " << layer_param.name();
     bool need_backward = false;
 
     // Figure out this layer's input and output
@@ -151,29 +147,23 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     } else {
       layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
     }
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "Setting up " << layer_names_[layer_id];
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Setting up " << layer_names_[layer_id];
     for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
       if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
         blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
       }
       blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "Top shape: "
-                  << top_vecs_[layer_id][top_id]->shape_string();
-      }
+      LOG_IF(INFO, Caffe::root_solver())
+          << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
       if (layer->loss(top_id)) {
-        if (Caffe::root_solver()) {
-          LOG(INFO) << "    with loss weight " << layer->loss(top_id);
-        }
+        LOG_IF(INFO, Caffe::root_solver())
+            << "    with loss weight " << layer->loss(top_id);
       }
       memory_used_ += top_vecs_[layer_id][top_id]->count();
     }
-    if (Caffe::root_solver()) {
-      DLOG(INFO) << "Memory required for data: "
-                 << memory_used_ * sizeof(Dtype);
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Memory required for data: " << memory_used_ * sizeof(Dtype);
     const int param_size = layer_param.param_size();
     const int num_param_blobs = layers_[layer_id]->blobs().size();
     CHECK_LE(param_size, num_param_blobs)
@@ -231,14 +221,12 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
       }
     }
     if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
-    if (layer_need_backward_[layer_id]) {
-      if (Caffe::root_solver()) {
+    if (Caffe::root_solver()) {
+      if (layer_need_backward_[layer_id]) {
         LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
-      }
-    } else {
-      if (Caffe::root_solver()) {
+      } else {
         LOG(INFO) << layer_names_[layer_id]
-                  << " does not need backward computation.";
+            << " does not need backward computation.";
       }
     }
     for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
@@ -279,9 +267,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // In the end, all remaining blobs are considered output blobs.
   for (set<string>::iterator it = available_blobs.begin();
       it != available_blobs.end(); ++it) {
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "This network produces output " << *it;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "This network produces output " << *it;
     net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
     net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
   }
@@ -293,10 +280,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   }
   ShareWeights();
   debug_info_ = param.debug_info();
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "Network initialization done.";
-    LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-  }
+  LOG_IF(INFO, Caffe::root_solver()) << "Network initialization done.";
 }
 
 template <typename Dtype>
@@ -335,33 +319,30 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
   // Check whether the rule is broken due to phase.
   if (rule.has_phase()) {
       if (rule.phase() != state.phase()) {
-        if (Caffe::root_solver()) {
-          LOG(INFO) << "The NetState phase (" << state.phase()
-                    << ") differed from the phase (" << rule.phase()
-                    << ") specified by a rule in layer " << layer_name;
-        }
+        LOG_IF(INFO, Caffe::root_solver())
+            << "The NetState phase (" << state.phase()
+            << ") differed from the phase (" << rule.phase()
+            << ") specified by a rule in layer " << layer_name;
         return false;
       }
   }
   // Check whether the rule is broken due to min level.
   if (rule.has_min_level()) {
     if (state.level() < rule.min_level()) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState level (" << state.level()
-                  << ") is above the min_level (" << rule.min_level()
-                  << ") specified by a rule in layer " << layer_name;
-      }
+      LOG_IF(INFO, Caffe::root_solver())
+          << "The NetState level (" << state.level()
+          << ") is below the min_level (" << rule.min_level()
+          << ") specified by a rule in layer " << layer_name;
       return false;
     }
   }
   // Check whether the rule is broken due to max level.
   if (rule.has_max_level()) {
     if (state.level() > rule.max_level()) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState level (" << state.level()
-                  << ") is above the max_level (" << rule.max_level()
-                  << ") specified by a rule in layer " << layer_name;
-      }
+      LOG_IF(INFO, Caffe::root_solver())
+          << "The NetState level (" << state.level()
+          << ") is above the max_level (" << rule.max_level()
+          << ") specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -374,10 +355,9 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (!has_stage) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-                  << "' specified by a rule in layer " << layer_name;
-      }
+      LOG_IF(INFO, Caffe::root_solver())
+          << "The NetState did not contain stage '" << rule.stage(i)
+          << "' specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -390,10 +370,9 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (has_stage) {
-      if (Caffe::root_solver()) {
-        LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-                  << "' specified by a rule in layer " << layer_name;
-      }
+      LOG_IF(INFO, Caffe::root_solver())
+          << "The NetState contained a not_stage '" << rule.not_stage(i)
+          << "' specified by a rule in layer " << layer_name;
       return false;
     }
   }
@@ -415,9 +394,8 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
   if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
       blob_name == layer_param->bottom(top_id)) {
     // In-place computation
-    if (Caffe::root_solver()) {
-      LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << layer_param->name() << " -> " << blob_name << " (in-place)";
     top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
     top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
   } else if (blob_name_to_idx &&
@@ -473,9 +451,8 @@ int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
                << layer_param.name() << "', bottom index " << bottom_id << ")";
   }
   const int blob_id = (*blob_name_to_idx)[blob_name];
-  if (Caffe::root_solver()) {
-    LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
-  }
+  LOG_IF(INFO, Caffe::root_solver())
+      << layer_names_[layer_id] << " <- " << blob_name;
   bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
   bottom_id_vecs_[layer_id].push_back(blob_id);
   available_blobs->erase(blob_name);
@@ -672,10 +649,9 @@ void Net<Dtype>::InputDebugInfo(const int input_id) {
   const Blob<Dtype>& blob = *net_input_blobs_[input_id];
   const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
   const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-  if (Caffe::root_solver()) {
-    LOG(INFO) << "    [Forward] "
-              << "Input " << blob_name << " data: " << data_abs_val_mean;
-  }
+  LOG_IF(INFO, Caffe::root_solver())
+      << "    [Forward] "
+      << "Input " << blob_name << " data: " << data_abs_val_mean;
 }
 
 template <typename Dtype>
@@ -684,12 +660,11 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
     const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Forward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", top blob " << blob_name
-                << " data: " << data_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Forward] "
+        << "Layer " << layer_names_[layer_id]
+        << ", top blob " << blob_name
+        << " data: " << data_abs_val_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
@@ -697,12 +672,11 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const int net_param_id = param_id_vecs_[layer_id][param_id];
     const string& blob_name = param_display_names_[net_param_id];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Forward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", param blob " << blob_name
-                << " data: " << data_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Forward] "
+        << "Layer " << layer_names_[layer_id]
+        << ", param blob " << blob_name
+        << " data: " << data_abs_val_mean;
   }
 }
 
@@ -714,24 +688,22 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *bottom_vec[bottom_id];
     const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Backward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", bottom blob " << blob_name
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Backward] "
+        << "Layer " << layer_names_[layer_id]
+        << ", bottom blob " << blob_name
+        << " diff: " << diff_abs_val_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
     if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
     const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Backward] "
-                << "Layer " << layer_names_[layer_id]
-                << ", param blob " << param_id
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Backward] "
+        << "Layer " << layer_names_[layer_id]
+        << ", param blob " << param_id
+        << " diff: " << diff_abs_val_mean;
   }
 }
 
@@ -744,22 +716,20 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
   const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
   if (param_owner < 0) {
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Update] Layer " << layer_name
-                << ", param " << param_display_name
-                << " data: " << data_abs_val_mean
-                << "; diff: " << diff_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Update] Layer " << layer_name
+        << ", param " << param_display_name
+        << " data: " << data_abs_val_mean
+        << "; diff: " << diff_abs_val_mean;
   } else {
     const string& owner_layer_name =
         layer_names_[param_layer_indices_[param_owner].first];
-    if (Caffe::root_solver()) {
-      LOG(INFO) << "    [Update] Layer " << layer_name
-                << ", param blob " << param_display_name
-                << " (owned by layer " << owner_layer_name << ", " << "param "
-                << param_display_names_[param_owners_[param_id]] << ")"
-                << " diff: " << diff_abs_val_mean;
-    }
+    LOG_IF(INFO, Caffe::root_solver())
+        << "    [Update] Layer " << layer_name
+        << ", param blob " << param_display_name
+        << " (owned by layer " << owner_layer_name << ", " << "param "
+        << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << diff_abs_val_mean;
   }
 }
 
index f52c941..76c869c 100644 (file)
@@ -98,7 +98,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 40 (last added: momentum2)
+// SolverParameter next available ID: 41 (last added: type)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -209,16 +209,9 @@ message SolverParameter {
   // (and by default) initialize using a seed derived from the system clock.
   optional int64 random_seed = 20 [default = -1];
 
-  // Solver type
-  enum SolverType {
-    SGD = 0;
-    NESTEROV = 1;
-    ADAGRAD = 2;
-    RMSPROP = 3;
-    ADADELTA = 4;
-    ADAM = 5;
-  }
-  optional SolverType solver_type = 30 [default = SGD];
+  // type of the solver
+  optional string type = 40 [default = "SGD"];
+
   // numerical stability for RMSProp, AdaGrad, AdaDelta and Adam
   optional float delta = 31 [default = 1e-8];
   // parameters for the Adam solver
@@ -234,6 +227,18 @@ message SolverParameter {
 
   // If false, don't save a snapshot after training finishes.
   optional bool snapshot_after_train = 28 [default = true];
+
+  // DEPRECATED: old solver enum types, use string instead
+  enum SolverType {
+    SGD = 0;
+    NESTEROV = 1;
+    ADAGRAD = 2;
+    RMSPROP = 3;
+    ADADELTA = 4;
+    ADAM = 5;
+  }
+  // DEPRECATED: use type instead of solver_type
+  optional SolverType solver_type = 30 [default = SGD];
 }
 
 // A message that stores the solver snapshots
@@ -443,6 +448,11 @@ message ArgMaxParameter {
   // If true produce pairs (argmax, maxval)
   optional bool out_max_val = 1 [default = false];
   optional uint32 top_k = 2 [default = 1];
+  // The axis along which to maximize -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
+  // for each index of the first / num dimension.
+  optional int32 axis = 3;
 }
 
 message ConcatParameter {
@@ -721,6 +731,12 @@ message LRNParameter {
   }
   optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
   optional float k = 5 [default = 1.];
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [default = DEFAULT];
 }
 
 message MemoryDataParameter {
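With the SolverParameter change above, solvers are selected by the string-typed `type` field (field 40), and the old `solver_type` enum is kept only for backward compatibility. A hypothetical C++ sketch using the generated accessors follows; the exact strings accepted are whatever the new solver registry exposes, with "SGD" as the documented default and "Adam" used here purely as an example.

    // Sketch only: assumes the protobuf-generated accessors for the fields
    // shown in the hunk above.
    #include "caffe/proto/caffe.pb.h"

    caffe::SolverParameter MakeSolverParam() {
      caffe::SolverParameter param;
      param.set_base_lr(0.01f);
      param.set_type("Adam");  // new string-typed selector (field 40)
      // Deprecated equivalent, still parsed for old solver prototxts:
      // param.set_solver_type(caffe::SolverParameter_SolverType_ADAM);
      return param;
    }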
index 12c13dd..d3bc736 100644 (file)
@@ -1,18 +1,11 @@
 #include <cstdio>
 
-#include <algorithm>
 #include <string>
 #include <vector>
 
-#include "hdf5.h"
-#include "hdf5_hl.h"
-
-#include "caffe/net.hpp"
-#include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
 #include "caffe/util/hdf5.hpp"
 #include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
 namespace caffe {
@@ -43,7 +36,7 @@ Solver<Dtype>::Solver(const string& param_file, const Solver* root_solver)
     : net_(), callbacks_(), root_solver_(root_solver),
       requested_early_exit_(false) {
   SolverParameter param;
-  ReadProtoFromTextFileOrDie(param_file, &param);
+  ReadSolverParamsFromTextFileOrDie(param_file, &param);
   Init(param);
 }
 
@@ -492,810 +485,6 @@ void Solver<Dtype>::Restore(const char* state_file) {
   }
 }
 
-// Return the current learning rate. The currently implemented learning rate
-// policies are as follows:
-//    - fixed: always return base_lr.
-//    - step: return base_lr * gamma ^ (floor(iter / step))
-//    - exp: return base_lr * gamma ^ iter
-//    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
-//    - multistep: similar to step but it allows non uniform steps defined by
-//      stepvalue
-//    - poly: the effective learning rate follows a polynomial decay, to be
-//      zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power)
-//    - sigmoid: the effective learning rate follows a sigmod decay
-//      return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize))))
-//
-// where base_lr, max_iter, gamma, step, stepvalue and power are defined
-// in the solver parameter protocol buffer, and iter is the current iteration.
-template <typename Dtype>
-Dtype SGDSolver<Dtype>::GetLearningRate() {
-  Dtype rate;
-  const string& lr_policy = this->param_.lr_policy();
-  if (lr_policy == "fixed") {
-    rate = this->param_.base_lr();
-  } else if (lr_policy == "step") {
-    this->current_step_ = this->iter_ / this->param_.stepsize();
-    rate = this->param_.base_lr() *
-        pow(this->param_.gamma(), this->current_step_);
-  } else if (lr_policy == "exp") {
-    rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
-  } else if (lr_policy == "inv") {
-    rate = this->param_.base_lr() *
-        pow(Dtype(1) + this->param_.gamma() * this->iter_,
-            - this->param_.power());
-  } else if (lr_policy == "multistep") {
-    if (this->current_step_ < this->param_.stepvalue_size() &&
-          this->iter_ >= this->param_.stepvalue(this->current_step_)) {
-      this->current_step_++;
-      LOG(INFO) << "MultiStep Status: Iteration " <<
-      this->iter_ << ", step = " << this->current_step_;
-    }
-    rate = this->param_.base_lr() *
-        pow(this->param_.gamma(), this->current_step_);
-  } else if (lr_policy == "poly") {
-    rate = this->param_.base_lr() * pow(Dtype(1.) -
-        (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
-        this->param_.power());
-  } else if (lr_policy == "sigmoid") {
-    rate = this->param_.base_lr() * (Dtype(1.) /
-        (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
-          Dtype(this->param_.stepsize())))));
-  } else {
-    LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
-  }
-  return rate;
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::PreSolve() {
-  // Initialize the history
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  history_.clear();
-  update_.clear();
-  temp_.clear();
-  for (int i = 0; i < net_params.size(); ++i) {
-    const vector<int>& shape = net_params[i]->shape();
-    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::ClipGradients() {
-  const Dtype clip_gradients = this->param_.clip_gradients();
-  if (clip_gradients < 0) { return; }
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  Dtype sumsq_diff = 0;
-  for (int i = 0; i < net_params.size(); ++i) {
-    sumsq_diff += net_params[i]->sumsq_diff();
-  }
-  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-  if (l2norm_diff > clip_gradients) {
-    Dtype scale_factor = clip_gradients / l2norm_diff;
-    LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-        << l2norm_diff << " > " << clip_gradients << ") "
-        << "by scale factor " << scale_factor;
-    for (int i = 0; i < net_params.size(); ++i) {
-      net_params[i]->scale_diff(scale_factor);
-    }
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::ApplyUpdate() {
-  CHECK(Caffe::root_solver());
-  Dtype rate = GetLearningRate();
-  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
-  }
-  ClipGradients();
-  for (int param_id = 0; param_id < this->net_->learnable_params().size();
-       ++param_id) {
-    Normalize(param_id);
-    Regularize(param_id);
-    ComputeUpdateValue(param_id, rate);
-  }
-  this->net_->Update();
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::Normalize(int param_id) {
-  if (this->param_.iter_size() == 1) { return; }
-  // Scale gradient to counterbalance accumulation.
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    caffe_scal(net_params[param_id]->count(), accum_normalization,
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::Regularize(int param_id) {
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
-  Dtype weight_decay = this->param_.weight_decay();
-  string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_cpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->cpu_data(),
-            temp_[param_id]->mutable_cpu_data());
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-      }
-    }
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_gpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->gpu_data(),
-            temp_[param_id]->mutable_gpu_data());
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-      }
-    }
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype momentum = this->param_.momentum();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  // Compute the update to history, then copy it to the parameter diff.
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              history_[param_id]->mutable_cpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              history_[param_id]->mutable_gpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::SnapshotSolverState(const string& model_filename) {
-  switch (this->param_.snapshot_format()) {
-    case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
-      SnapshotSolverStateToBinaryProto(model_filename);
-      break;
-    case caffe::SolverParameter_SnapshotFormat_HDF5:
-      SnapshotSolverStateToHDF5(model_filename);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported snapshot format.";
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
-    const string& model_filename) {
-  SolverState state;
-  state.set_iter(this->iter_);
-  state.set_learned_net(model_filename);
-  state.set_current_step(this->current_step_);
-  state.clear_history();
-  for (int i = 0; i < history_.size(); ++i) {
-    // Add history
-    BlobProto* history_blob = state.add_history();
-    history_[i]->ToProto(history_blob);
-  }
-  string snapshot_filename = Solver<Dtype>::SnapshotFilename(".solverstate");
-  LOG(INFO)
-    << "Snapshotting solver state to binary proto file " << snapshot_filename;
-  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
-    const string& model_filename) {
-  string snapshot_filename =
-      Solver<Dtype>::SnapshotFilename(".solverstate.h5");
-  LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename;
-  hid_t file_hid = H5Fcreate(snapshot_filename.c_str(), H5F_ACC_TRUNC,
-      H5P_DEFAULT, H5P_DEFAULT);
-  CHECK_GE(file_hid, 0)
-      << "Couldn't open " << snapshot_filename << " to save solver state.";
-  hdf5_save_int(file_hid, "iter", this->iter_);
-  hdf5_save_string(file_hid, "learned_net", model_filename);
-  hdf5_save_int(file_hid, "current_step", this->current_step_);
-  hid_t history_hid = H5Gcreate2(file_hid, "history", H5P_DEFAULT, H5P_DEFAULT,
-      H5P_DEFAULT);
-  CHECK_GE(history_hid, 0)
-      << "Error saving solver state to " << snapshot_filename << ".";
-  for (int i = 0; i < history_.size(); ++i) {
-    ostringstream oss;
-    oss << i;
-    hdf5_save_nd_dataset<Dtype>(history_hid, oss.str(), *history_[i]);
-  }
-  H5Gclose(history_hid);
-  H5Fclose(file_hid);
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto(
-    const string& state_file) {
-  SolverState state;
-  ReadProtoFromBinaryFile(state_file, &state);
-  this->iter_ = state.iter();
-  if (state.has_learned_net()) {
-    NetParameter net_param;
-    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
-    this->net_->CopyTrainedLayersFrom(net_param);
-  }
-  this->current_step_ = state.current_step();
-  CHECK_EQ(state.history_size(), history_.size())
-      << "Incorrect length of history blobs.";
-  LOG(INFO) << "SGDSolver: restoring history";
-  for (int i = 0; i < history_.size(); ++i) {
-    history_[i]->FromProto(state.history(i));
-  }
-}
-
-template <typename Dtype>
-void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
-  hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
-  CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file;
-  this->iter_ = hdf5_load_int(file_hid, "iter");
-  if (H5LTfind_dataset(file_hid, "learned_net")) {
-    string learned_net = hdf5_load_string(file_hid, "learned_net");
-    this->net_->CopyTrainedLayersFrom(learned_net);
-  }
-  this->current_step_ = hdf5_load_int(file_hid, "current_step");
-  hid_t history_hid = H5Gopen2(file_hid, "history", H5P_DEFAULT);
-  CHECK_GE(history_hid, 0) << "Error reading history from " << state_file;
-  int state_history_size = hdf5_get_num_links(history_hid);
-  CHECK_EQ(state_history_size, history_.size())
-      << "Incorrect length of history blobs.";
-  for (int i = 0; i < history_.size(); ++i) {
-    ostringstream oss;
-    oss << i;
-    hdf5_load_nd_dataset<Dtype>(history_hid, oss.str().c_str(), 0,
-                                kMaxBlobAxes, history_[i].get());
-  }
-  H5Gclose(history_hid);
-  H5Fclose(file_hid);
-}
-
-template <typename Dtype>
-void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype momentum = this->param_.momentum();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              this->history_[param_id]->mutable_cpu_data());
-
-    // compute update: step back then over step
-    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->cpu_data(), -momentum,
-        this->update_[param_id]->mutable_cpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              this->history_[param_id]->mutable_gpu_data());
-
-    // compute update: step back then over step
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->gpu_data(), -momentum,
-        this->update_[param_id]->mutable_gpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype delta = this->param_.delta();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_add(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        this->history_[param_id]->cpu_data(),
-        this->history_[param_id]->mutable_cpu_data());
-
-    // prepare update
-    caffe_powx(net_params[param_id]->count(),
-              this->history_[param_id]->cpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_cpu_data());
-
-    caffe_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_cpu_data());
-
-    caffe_div(net_params[param_id]->count(),
-              net_params[param_id]->cpu_diff(),
-              this->update_[param_id]->cpu_data(),
-              this->update_[param_id]->mutable_cpu_data());
-
-    // scale and copy
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->cpu_data(), Dtype(0),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->history_[param_id]->mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
-              this->history_[param_id]->gpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-              net_params[param_id]->gpu_diff(),
-              this->update_[param_id]->gpu_data(),
-              this->update_[param_id]->mutable_gpu_data());
-
-    // scale and copy
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-
-  // get the learning rate
-  Dtype delta = this->param_.delta();
-  Dtype rms_decay = this->param_.rms_decay();
-  Dtype local_rate = rate * net_params_lr[param_id];
-
-  switch (Caffe::mode()) {
-  case Caffe::CPU:
-    // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_cpu_axpby(net_params[param_id] -> count(),
-        Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
-        rms_decay, this->history_[param_id]-> mutable_cpu_data());
-
-    // prepare update
-    caffe_powx(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_cpu_data());
-
-    caffe_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_cpu_data());
-
-    caffe_div(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // scale and copy
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->cpu_data(), Dtype(0),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  case Caffe::GPU:
-#ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id] -> count(),
-        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
-        rms_decay, this->history_[param_id]-> mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
-  // Add the extra history entries for AdaDelta after those from
-  // SGDSolver::PreSolve
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  for (int i = 0; i < net_params.size(); ++i) {
-        const vector<int>& shape = net_params[i]->shape();
-        this->history_.push_back(
-                shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-  }
-}
-
-template <typename Dtype>
-void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype delta = this->param_.delta();
-  Dtype momentum = this->param_.momentum();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  size_t update_history_offset = net_params.size();
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history of gradients
-    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->cpu_data(), momentum,
-        this->history_[param_id]->mutable_cpu_data());
-
-    // add delta to history to guard against dividing by zero later
-    caffe_set(net_params[param_id]->count(), delta,
-        this->temp_[param_id]->mutable_cpu_data());
-
-    caffe_add(net_params[param_id]->count(),
-        this->temp_[param_id]->cpu_data(),
-        this->history_[update_history_offset + param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    caffe_add(net_params[param_id]->count(),
-        this->temp_[param_id]->cpu_data(),
-        this->history_[param_id]->cpu_data(),
-        this->temp_[param_id]->mutable_cpu_data());
-
-    // divide history of updates by history of gradients
-    caffe_div(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        this->temp_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // jointly compute the RMS of both for update and gradient history
-    caffe_powx(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // compute the update
-    caffe_mul(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(),
-        this->update_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-
-    // compute square of update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history of updates
-    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->cpu_data(), momentum,
-        this->history_[update_history_offset + param_id]->mutable_cpu_data());
-
-    // apply learning rate
-    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->cpu_diff(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history of gradients
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->gpu_data(), momentum,
-        this->history_[param_id]->mutable_gpu_data());
-
-    // add delta to history to guard against dividing by zero later
-    caffe_gpu_set(net_params[param_id]->count(), delta,
-        this->temp_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->temp_[param_id]->gpu_data(),
-        this->history_[update_history_offset + param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->temp_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->temp_[param_id]->mutable_gpu_data());
-
-    // divide history of updates by history of gradients
-    caffe_gpu_div(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->temp_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // jointly compute the RMS of both for update and gradient history
-    caffe_gpu_powx(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // compute the update and copy to net_diff
-    caffe_gpu_mul(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
-
-    // compute square of update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history of updates
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->gpu_data(), momentum,
-        this->history_[update_history_offset + param_id]->mutable_gpu_data());
-
-    // apply learning rate
-    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
-template <typename Dtype>
-void AdamSolver<Dtype>::AdamPreSolve() {
-  // Add the extra history entries for Adam after those from
-  // SGDSolver::PreSolve
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  for (int i = 0; i < net_params.size(); ++i) {
-    const vector<int>& shape = net_params[i]->shape();
-    this->history_.push_back(
-            shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-  }
-}
-
-template <typename Dtype>
-void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  const Dtype beta1 = this->param_.momentum();
-  const Dtype beta2 = this->param_.momentum2();
-
-  // we create aliases for convenience
-  size_t update_history_offset = net_params.size();
-  Blob<Dtype>* val_m = this->history_[param_id].get();
-  Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
-  Blob<Dtype>* val_t = this->temp_[param_id].get();
-
-  const int t = this->iter_  + 1;
-  const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
-      (Dtype(1.) - pow(beta1, t));
-  const int N = net_params[param_id]->count();
-  const Dtype eps_hat = this->param_.delta();
-
-  switch (Caffe::mode()) {
-    case Caffe::CPU: {
-    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
-    caffe_cpu_axpby(N, Dtype(1)-beta1,
-        net_params[param_id]->cpu_diff(), beta1,
-        val_m->mutable_cpu_data());
-
-    // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2
-    caffe_mul(N,
-        net_params[param_id]->cpu_diff(),
-        net_params[param_id]->cpu_diff(),
-    val_t->mutable_cpu_data());
-    caffe_cpu_axpby(N, Dtype(1)-beta2,
-        val_t->cpu_data(), beta2,
-        val_v->mutable_cpu_data());
-
-    // set update
-    caffe_powx(N,
-        val_v->cpu_data(), Dtype(0.5),
-        val_t->mutable_cpu_data());
-    caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data());
-    caffe_div(N,
-        val_m->cpu_data(),
-        val_t->cpu_data(),
-        val_t->mutable_cpu_data());
-
-    caffe_cpu_scale(N, local_rate*correction,
-        val_t->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
-#ifndef CPU_ONLY
-    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
-    caffe_gpu_axpby(N, Dtype(1)-beta1,
-        net_params[param_id]->gpu_diff(), beta1,
-        val_m->mutable_gpu_data());
-
-    // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2
-    caffe_gpu_mul(N,
-        net_params[param_id]->gpu_diff(),
-        net_params[param_id]->gpu_diff(),
-        val_t->mutable_gpu_data());
-    caffe_gpu_axpby(N, Dtype(1)-beta2,
-        val_t->gpu_data(), beta2,
-        val_v->mutable_gpu_data());
-
-    // set update
-    caffe_gpu_powx(N,
-        val_v->gpu_data(), Dtype(0.5),
-        val_t->mutable_gpu_data());
-    caffe_gpu_add_scalar(N, eps_hat,
-        val_t->mutable_gpu_data());
-    caffe_gpu_div(N,
-        val_m->gpu_data(),
-        val_t->gpu_data(),
-        val_t->mutable_gpu_data());
-
-    caffe_gpu_scale(N, local_rate*correction,
-        val_t->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
-#else
-    NO_GPU;
-#endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
-}
-
 INSTANTIATE_CLASS(Solver);
-INSTANTIATE_CLASS(SGDSolver);
-INSTANTIATE_CLASS(NesterovSolver);
-INSTANTIATE_CLASS(AdaGradSolver);
-INSTANTIATE_CLASS(RMSPropSolver);
-INSTANTIATE_CLASS(AdaDeltaSolver);
-INSTANTIATE_CLASS(AdamSolver);
 
 }  // namespace caffe
diff --git a/src/caffe/solvers/adadelta_solver.cpp b/src/caffe/solvers/adadelta_solver.cpp
new file mode 100644 (file)
index 0000000..a37899e
--- /dev/null
@@ -0,0 +1,156 @@
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
+  // Add the extra history entries for AdaDelta after those from
+  // SGDSolver::PreSolve
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    this->history_.push_back(
+        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype delta = this->param_.delta();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  size_t update_history_offset = net_params.size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of gradients
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[update_history_offset + param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->temp_[param_id]->mutable_cpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_div(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->temp_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // jointly compute the RMS of both the update and gradient histories
+    caffe_powx(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // compute the update
+    caffe_mul(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+
+    // compute square of update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of updates
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_cpu_data());
+
+    // apply learning rate
+    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of gradients
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_gpu_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[update_history_offset + param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->temp_[param_id]->mutable_gpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_gpu_div(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->temp_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // jointly compute the RMS of both the update and gradient histories
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // compute the update and copy to net_diff
+    caffe_gpu_mul(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+
+    // compute square of update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of updates
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_gpu_data());
+
+    // apply learning rate
+    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+INSTANTIATE_CLASS(AdaDeltaSolver);
+REGISTER_SOLVER_CLASS(AdaDelta);
+
+}  // namespace caffe
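A minimal solver prototxt sketch for this solver (values illustrative): here momentum acts as AdaDelta's decay rate for the running averages of squared gradients and squared updates, and delta is the stability term added before the division.

  type: "AdaDelta"
  momentum: 0.95
  delta: 1e-6
  base_lr: 1.0       # still multiplied into the adaptive step as local_rate
  lr_policy: "fixed"
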
diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp
new file mode 100644 (file)
index 0000000..5e40632
--- /dev/null
@@ -0,0 +1,89 @@
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  CHECK(Caffe::root_solver());
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype delta = this->param_.delta();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_add(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->history_[param_id]->mutable_cpu_data());
+
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+              this->history_[param_id]->cpu_data(), Dtype(0.5),
+              this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add_scalar(net_params[param_id]->count(),
+              delta, this->update_[param_id]->mutable_cpu_data());
+
+    caffe_div(net_params[param_id]->count(),
+              net_params[param_id]->cpu_diff(),
+              this->update_[param_id]->cpu_data(),
+              this->update_[param_id]->mutable_cpu_data());
+
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->history_[param_id]->mutable_gpu_data());
+
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+              this->history_[param_id]->gpu_data(), Dtype(0.5),
+              this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add_scalar(net_params[param_id]->count(),
+              delta, this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_div(net_params[param_id]->count(),
+              net_params[param_id]->gpu_diff(),
+              this->update_[param_id]->gpu_data(),
+              this->update_[param_id]->mutable_gpu_data());
+
+    // scale and copy
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+INSTANTIATE_CLASS(AdaGradSolver);
+REGISTER_SOLVER_CLASS(AdaGrad);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/adam_solver.cpp b/src/caffe/solvers/adam_solver.cpp
new file mode 100644 (file)
index 0000000..cb0fbfe
--- /dev/null
@@ -0,0 +1,113 @@
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void AdamSolver<Dtype>::AdamPreSolve() {
+  // Add the extra history entries for Adam after those from
+  // SGDSolver::PreSolve
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    this->history_.push_back(
+        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  const Dtype beta1 = this->param_.momentum();
+  const Dtype beta2 = this->param_.momentum2();
+
+  // we create aliases for convenience
+  size_t update_history_offset = net_params.size();
+  Blob<Dtype>* val_m = this->history_[param_id].get();
+  Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
+  Blob<Dtype>* val_t = this->temp_[param_id].get();
+
+  const int t = this->iter_ + 1;
+  const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
+      (Dtype(1.) - pow(beta1, t));
+  const int N = net_params[param_id]->count();
+  const Dtype eps_hat = this->param_.delta();
+
+  switch (Caffe::mode()) {
+    case Caffe::CPU: {
+    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
+    caffe_cpu_axpby(N, Dtype(1)-beta1,
+        net_params[param_id]->cpu_diff(), beta1,
+        val_m->mutable_cpu_data());
+
+    // update v <- \beta_2 v_{t-1} + (1-\beta_2)g_t^2
+    caffe_mul(N,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->cpu_diff(),
+        val_t->mutable_cpu_data());
+    caffe_cpu_axpby(N, Dtype(1)-beta2,
+        val_t->cpu_data(), beta2,
+        val_v->mutable_cpu_data());
+
+    // set update
+    caffe_powx(N,
+        val_v->cpu_data(), Dtype(0.5),
+        val_t->mutable_cpu_data());
+    caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data());
+    caffe_div(N,
+        val_m->cpu_data(),
+        val_t->cpu_data(),
+        val_t->mutable_cpu_data());
+
+    caffe_cpu_scale(N, local_rate*correction,
+        val_t->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
+    caffe_gpu_axpby(N, Dtype(1)-beta1,
+        net_params[param_id]->gpu_diff(), beta1,
+        val_m->mutable_gpu_data());
+
+    // update v <- \beta_2 v_{t-1} + (1-\beta_2)g_t^2
+    caffe_gpu_mul(N,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->gpu_diff(),
+        val_t->mutable_gpu_data());
+    caffe_gpu_axpby(N, Dtype(1)-beta2,
+        val_t->gpu_data(), beta2,
+        val_v->mutable_gpu_data());
+
+    // set update
+    caffe_gpu_powx(N,
+        val_v->gpu_data(), Dtype(0.5),
+        val_t->mutable_gpu_data());
+    caffe_gpu_add_scalar(N, eps_hat,
+        val_t->mutable_gpu_data());
+    caffe_gpu_div(N,
+        val_m->gpu_data(),
+        val_t->gpu_data(),
+        val_t->mutable_gpu_data());
+
+    caffe_gpu_scale(N, local_rate*correction,
+        val_t->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+INSTANTIATE_CLASS(AdamSolver);
+REGISTER_SOLVER_CLASS(Adam);
+
+}  // namespace caffe
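Mapping the parameters used above onto a solver prototxt sketch (values illustrative): momentum is beta1, momentum2 is beta2, and delta is eps_hat; the bias correction sqrt(1 - beta2^t) / (1 - beta1^t) is computed from the iteration count and needs no field of its own.

  type: "Adam"
  base_lr: 0.001
  momentum: 0.9      # beta1
  momentum2: 0.999   # beta2
  delta: 1e-8        # eps_hat
  lr_policy: "fixed"
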
diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp
new file mode 100644 (file)
index 0000000..34bf01e
--- /dev/null
@@ -0,0 +1,71 @@
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  CHECK(Caffe::root_solver());
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // save history momentum for stepping back
+    caffe_copy(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+              net_params[param_id]->cpu_diff(), momentum,
+              this->history_[param_id]->mutable_cpu_data());
+
+    // compute update: step back then over step
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+        this->history_[param_id]->cpu_data(), -momentum,
+        this->update_[param_id]->mutable_cpu_data());
+
+    // copy
+    caffe_copy(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // save history momentum for stepping back
+    caffe_copy(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+              net_params[param_id]->gpu_diff(), momentum,
+              this->history_[param_id]->mutable_gpu_data());
+
+    // compute update: step back then over step
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+        this->history_[param_id]->gpu_data(), -momentum,
+        this->update_[param_id]->mutable_gpu_data());
+
+    // copy
+    caffe_copy(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+INSTANTIATE_CLASS(NesterovSolver);
+REGISTER_SOLVER_CLASS(Nesterov);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/rmsprop_solver.cpp b/src/caffe/solvers/rmsprop_solver.cpp
new file mode 100644 (file)
index 0000000..c624767
--- /dev/null
@@ -0,0 +1,85 @@
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+
+  // get the hyperparameters and the local learning rate
+  Dtype delta = this->param_.delta();
+  Dtype rms_decay = this->param_.rms_decay();
+  Dtype local_rate = rate * net_params_lr[param_id];
+
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
+        rms_decay, this->history_[param_id]->mutable_cpu_data());
+
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_cpu_data());
+
+    caffe_div(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  case Caffe::GPU:
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
+        rms_decay, this->history_[param_id]->mutable_gpu_data());
+
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_div(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+INSTANTIATE_CLASS(RMSPropSolver);
+REGISTER_SOLVER_CLASS(RMSProp);
+
+}  // namespace caffe
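The corresponding prototxt sketch for RMSProp (values illustrative): rms_decay is the decay rate of the running average of squared gradients updated above, and delta again guards the division.

  type: "RMSProp"
  base_lr: 0.01
  rms_decay: 0.98
  delta: 1e-8
  lr_policy: "fixed"
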
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
new file mode 100644 (file)
index 0000000..32bf19b
--- /dev/null
@@ -0,0 +1,348 @@
+#include <string>
+#include <vector>
+
+#include "caffe/sgd_solvers.hpp"
+#include "caffe/util/hdf5.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/upgrade_proto.hpp"
+
+namespace caffe {
+
+// Return the current learning rate. The currently implemented learning rate
+// policies are as follows:
+//    - fixed: always return base_lr.
+//    - step: return base_lr * gamma ^ (floor(iter / step))
+//    - exp: return base_lr * gamma ^ iter
+//    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
+//    - multistep: similar to step but it allows non-uniform steps defined by
+//      stepvalue
+//    - poly: the effective learning rate follows a polynomial decay, reaching
+//      zero at max_iter: return base_lr * (1 - iter/max_iter) ^ power
+//    - sigmoid: the effective learning rate follows a sigmoid decay:
+//      return base_lr * (1 / (1 + exp(-gamma * (iter - stepsize))))
+//
+// where base_lr, max_iter, gamma, step, stepvalue and power are defined
+// in the solver parameter protocol buffer, and iter is the current iteration.
+template <typename Dtype>
+Dtype SGDSolver<Dtype>::GetLearningRate() {
+  Dtype rate;
+  const string& lr_policy = this->param_.lr_policy();
+  if (lr_policy == "fixed") {
+    rate = this->param_.base_lr();
+  } else if (lr_policy == "step") {
+    this->current_step_ = this->iter_ / this->param_.stepsize();
+    rate = this->param_.base_lr() *
+        pow(this->param_.gamma(), this->current_step_);
+  } else if (lr_policy == "exp") {
+    rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
+  } else if (lr_policy == "inv") {
+    rate = this->param_.base_lr() *
+        pow(Dtype(1) + this->param_.gamma() * this->iter_,
+            - this->param_.power());
+  } else if (lr_policy == "multistep") {
+    if (this->current_step_ < this->param_.stepvalue_size() &&
+          this->iter_ >= this->param_.stepvalue(this->current_step_)) {
+      this->current_step_++;
+      LOG(INFO) << "MultiStep Status: Iteration " <<
+      this->iter_ << ", step = " << this->current_step_;
+    }
+    rate = this->param_.base_lr() *
+        pow(this->param_.gamma(), this->current_step_);
+  } else if (lr_policy == "poly") {
+    rate = this->param_.base_lr() * pow(Dtype(1.) -
+        (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
+        this->param_.power());
+  } else if (lr_policy == "sigmoid") {
+    rate = this->param_.base_lr() * (Dtype(1.) /
+        (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
+          Dtype(this->param_.stepsize())))));
+  } else {
+    LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
+  }
+  return rate;
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::PreSolve() {
+  // Initialize the history
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  history_.clear();
+  update_.clear();
+  temp_.clear();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::ClipGradients() {
+  const Dtype clip_gradients = this->param_.clip_gradients();
+  if (clip_gradients < 0) { return; }
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  Dtype sumsq_diff = 0;
+  for (int i = 0; i < net_params.size(); ++i) {
+    sumsq_diff += net_params[i]->sumsq_diff();
+  }
+  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+  if (l2norm_diff > clip_gradients) {
+    Dtype scale_factor = clip_gradients / l2norm_diff;
+    LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
+        << l2norm_diff << " > " << clip_gradients << ") "
+        << "by scale factor " << scale_factor;
+    for (int i = 0; i < net_params.size(); ++i) {
+      net_params[i]->scale_diff(scale_factor);
+    }
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::ApplyUpdate() {
+  CHECK(Caffe::root_solver());
+  Dtype rate = GetLearningRate();
+  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+  }
+  ClipGradients();
+  for (int param_id = 0; param_id < this->net_->learnable_params().size();
+       ++param_id) {
+    Normalize(param_id);
+    Regularize(param_id);
+    ComputeUpdateValue(param_id, rate);
+  }
+  this->net_->Update();
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::Normalize(int param_id) {
+  if (this->param_.iter_size() == 1) { return; }
+  // Scale gradient to counterbalance accumulation.
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::Regularize(int param_id) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  Dtype weight_decay = this->param_.weight_decay();
+  string regularization_type = this->param_.regularization_type();
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    if (local_decay) {
+      if (regularization_type == "L2") {
+        // add weight decay
+        caffe_axpy(net_params[param_id]->count(),
+            local_decay,
+            net_params[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      } else if (regularization_type == "L1") {
+        caffe_cpu_sign(net_params[param_id]->count(),
+            net_params[param_id]->cpu_data(),
+            temp_[param_id]->mutable_cpu_data());
+        caffe_axpy(net_params[param_id]->count(),
+            local_decay,
+            temp_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      } else {
+        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+      }
+    }
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    if (local_decay) {
+      if (regularization_type == "L2") {
+        // add weight decay
+        caffe_gpu_axpy(net_params[param_id]->count(),
+            local_decay,
+            net_params[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      } else if (regularization_type == "L1") {
+        caffe_gpu_sign(net_params[param_id]->count(),
+            net_params[param_id]->gpu_data(),
+            temp_[param_id]->mutable_gpu_data());
+        caffe_gpu_axpy(net_params[param_id]->count(),
+            local_decay,
+            temp_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      } else {
+        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+      }
+    }
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  // Compute the update to history, then copy it to the parameter diff.
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+              net_params[param_id]->cpu_diff(), momentum,
+              history_[param_id]->mutable_cpu_data());
+    caffe_copy(net_params[param_id]->count(),
+        history_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+              net_params[param_id]->gpu_diff(), momentum,
+              history_[param_id]->mutable_gpu_data());
+    caffe_copy(net_params[param_id]->count(),
+        history_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::SnapshotSolverState(const string& model_filename) {
+  switch (this->param_.snapshot_format()) {
+    case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
+      SnapshotSolverStateToBinaryProto(model_filename);
+      break;
+    case caffe::SolverParameter_SnapshotFormat_HDF5:
+      SnapshotSolverStateToHDF5(model_filename);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported snapshot format.";
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
+    const string& model_filename) {
+  SolverState state;
+  state.set_iter(this->iter_);
+  state.set_learned_net(model_filename);
+  state.set_current_step(this->current_step_);
+  state.clear_history();
+  for (int i = 0; i < history_.size(); ++i) {
+    // Add history
+    BlobProto* history_blob = state.add_history();
+    history_[i]->ToProto(history_blob);
+  }
+  string snapshot_filename = Solver<Dtype>::SnapshotFilename(".solverstate");
+  LOG(INFO)
+    << "Snapshotting solver state to binary proto file " << snapshot_filename;
+  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
+    const string& model_filename) {
+  string snapshot_filename =
+      Solver<Dtype>::SnapshotFilename(".solverstate.h5");
+  LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename;
+  hid_t file_hid = H5Fcreate(snapshot_filename.c_str(), H5F_ACC_TRUNC,
+      H5P_DEFAULT, H5P_DEFAULT);
+  CHECK_GE(file_hid, 0)
+      << "Couldn't open " << snapshot_filename << " to save solver state.";
+  hdf5_save_int(file_hid, "iter", this->iter_);
+  hdf5_save_string(file_hid, "learned_net", model_filename);
+  hdf5_save_int(file_hid, "current_step", this->current_step_);
+  hid_t history_hid = H5Gcreate2(file_hid, "history", H5P_DEFAULT, H5P_DEFAULT,
+      H5P_DEFAULT);
+  CHECK_GE(history_hid, 0)
+      << "Error saving solver state to " << snapshot_filename << ".";
+  for (int i = 0; i < history_.size(); ++i) {
+    ostringstream oss;
+    oss << i;
+    hdf5_save_nd_dataset<Dtype>(history_hid, oss.str(), *history_[i]);
+  }
+  H5Gclose(history_hid);
+  H5Fclose(file_hid);
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto(
+    const string& state_file) {
+  SolverState state;
+  ReadProtoFromBinaryFile(state_file, &state);
+  this->iter_ = state.iter();
+  if (state.has_learned_net()) {
+    NetParameter net_param;
+    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
+    this->net_->CopyTrainedLayersFrom(net_param);
+  }
+  this->current_step_ = state.current_step();
+  CHECK_EQ(state.history_size(), history_.size())
+      << "Incorrect length of history blobs.";
+  LOG(INFO) << "SGDSolver: restoring history";
+  for (int i = 0; i < history_.size(); ++i) {
+    history_[i]->FromProto(state.history(i));
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
+  hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+  CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file;
+  this->iter_ = hdf5_load_int(file_hid, "iter");
+  if (H5LTfind_dataset(file_hid, "learned_net")) {
+    string learned_net = hdf5_load_string(file_hid, "learned_net");
+    this->net_->CopyTrainedLayersFrom(learned_net);
+  }
+  this->current_step_ = hdf5_load_int(file_hid, "current_step");
+  hid_t history_hid = H5Gopen2(file_hid, "history", H5P_DEFAULT);
+  CHECK_GE(history_hid, 0) << "Error reading history from " << state_file;
+  int state_history_size = hdf5_get_num_links(history_hid);
+  CHECK_EQ(state_history_size, history_.size())
+      << "Incorrect length of history blobs.";
+  for (int i = 0; i < history_.size(); ++i) {
+    ostringstream oss;
+    oss << i;
+    hdf5_load_nd_dataset<Dtype>(history_hid, oss.str().c_str(), 0,
+                                kMaxBlobAxes, history_[i].get());
+  }
+  H5Gclose(history_hid);
+  H5Fclose(file_hid);
+}
+
+INSTANTIATE_CLASS(SGDSolver);
+REGISTER_SOLVER_CLASS(SGD);
+
+}  // namespace caffe
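
Taken together, the three per-parameter steps driven by the loop above (Normalize, Regularize, ComputeUpdateValue) reduce to a handful of vector operations. A minimal standalone sketch of the momentum update performed by ComputeUpdateValue, written with plain loops under the assumption that caffe_cpu_axpby(n, a, X, b, Y) computes Y = a*X + b*Y, as its use above suggests (this sketch is illustration only, not part of the patch):

    #include <vector>

    // Per-blob SGD momentum update: history = local_rate * diff + momentum * history,
    // then the history value is copied back into the parameter diff, which
    // this->net_->Update() subsequently applies to the parameter data.
    void sgd_momentum_update(std::vector<float>* diff,
                             std::vector<float>* history,
                             float local_rate, float momentum) {
      for (size_t i = 0; i < diff->size(); ++i) {
        (*history)[i] = local_rate * (*diff)[i] + momentum * (*history)[i];
        (*diff)[i] = (*history)[i];
      }
    }
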
index a667a86..632bf1f 100644 (file)
@@ -8,7 +8,7 @@ namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
   if (cpu_ptr_ && own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
   }
 
 #ifndef CPU_ONLY
@@ -27,7 +27,7 @@ SyncedMemory::~SyncedMemory() {
 inline void SyncedMemory::to_cpu() {
   switch (head_) {
   case UNINITIALIZED:
-    CaffeMallocHost(&cpu_ptr_, size_);
+    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
     caffe_memset(size_, 0, cpu_ptr_);
     head_ = HEAD_AT_CPU;
     own_cpu_data_ = true;
@@ -35,7 +35,7 @@ inline void SyncedMemory::to_cpu() {
   case HEAD_AT_GPU:
 #ifndef CPU_ONLY
     if (cpu_ptr_ == NULL) {
-      CaffeMallocHost(&cpu_ptr_, size_);
+      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
       own_cpu_data_ = true;
     }
     caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
@@ -86,7 +86,7 @@ const void* SyncedMemory::cpu_data() {
 void SyncedMemory::set_cpu_data(void* data) {
   CHECK(data);
   if (own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
   }
   cpu_ptr_ = data;
   head_ = HEAD_AT_CPU;
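
The cpu_malloc_use_cuda_ flag threaded through these calls records how the host buffer was allocated so that the matching deallocator can be used later. One plausible shape for the updated helpers, sketched here purely as an assumption (the real definitions live elsewhere in the tree and may differ): pin host memory with cudaMallocHost when running in GPU mode, and fall back to malloc otherwise.

    #include <cstdlib>

    // Hypothetical sketch of the allocation helpers implied by the new
    // signatures; CUDA_CHECK / CHECK and Caffe::mode() are the usual Caffe
    // macros and calls.
    inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
    #ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        CUDA_CHECK(cudaMallocHost(ptr, size));  // pinned memory, faster async copies
        *use_cuda = true;
        return;
      }
    #endif
      *ptr = malloc(size);
      *use_cuda = false;
      CHECK(*ptr) << "host allocation of size " << size << " failed";
    }

    inline void CaffeFreeHost(void* ptr, bool use_cuda) {
    #ifndef CPU_ONLY
      if (use_cuda) {
        CUDA_CHECK(cudaFreeHost(ptr));
        return;
      }
    #endif
      free(ptr);
    }
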
index 895c3d3..bbf1909 100644 (file)
@@ -16,7 +16,7 @@ template <typename Dtype>
 class ArgMaxLayerTest : public CPUDeviceTest<Dtype> {
  protected:
   ArgMaxLayerTest()
-      : blob_bottom_(new Blob<Dtype>(10, 20, 1, 1)),
+      : blob_bottom_(new Blob<Dtype>(10, 10, 20, 20)),
         blob_top_(new Blob<Dtype>()),
         top_k_(5) {
     Caffe::set_random_seed(1701);
@@ -55,6 +55,43 @@ TYPED_TEST(ArgMaxLayerTest, TestSetupMaxVal) {
   EXPECT_EQ(this->blob_top_->channels(), 2);
 }
 
+TYPED_TEST(ArgMaxLayerTest, TestSetupAxis) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(0);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->shape(0), argmax_param->top_k());
+  EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(0));
+  EXPECT_EQ(this->blob_top_->shape(2), this->blob_bottom_->shape(2));
+  EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3));
+}
+
+TYPED_TEST(ArgMaxLayerTest, TestSetupAxisNegativeIndexing) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(-2);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->shape(0), this->blob_bottom_->shape(0));
+  EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(1));
+  EXPECT_EQ(this->blob_top_->shape(2), argmax_param->top_k());
+  EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3));
+}
+
+TYPED_TEST(ArgMaxLayerTest, TestSetupAxisMaxVal) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(2);
+  argmax_param->set_out_max_val(true);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->shape(0), this->blob_bottom_->shape(0));
+  EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(1));
+  EXPECT_EQ(this->blob_top_->shape(2), argmax_param->top_k());
+  EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3));
+}
+
 TYPED_TEST(ArgMaxLayerTest, TestCPU) {
   LayerParameter layer_param;
   ArgMaxLayer<TypeParam> layer(layer_param);
@@ -112,6 +149,7 @@ TYPED_TEST(ArgMaxLayerTest, TestCPUTopK) {
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
   // Now, check values
+  const TypeParam* bottom_data = this->blob_bottom_->cpu_data();
   int max_ind;
   TypeParam max_val;
   int num = this->blob_bottom_->num();
@@ -121,10 +159,10 @@ TYPED_TEST(ArgMaxLayerTest, TestCPUTopK) {
     EXPECT_LE(this->blob_top_->data_at(i, 0, 0, 0), dim);
     for (int j = 0; j < this->top_k_; ++j) {
       max_ind = this->blob_top_->data_at(i, 0, j, 0);
-      max_val = this->blob_bottom_->data_at(i, max_ind, 0, 0);
+      max_val = bottom_data[i * dim + max_ind];
       int count = 0;
       for (int k = 0; k < dim; ++k) {
-        if (this->blob_bottom_->data_at(i, k, 0, 0) > max_val) {
+        if (bottom_data[i * dim + k] > max_val) {
           ++count;
         }
       }
@@ -142,6 +180,7 @@ TYPED_TEST(ArgMaxLayerTest, TestCPUMaxValTopK) {
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
   // Now, check values
+  const TypeParam* bottom_data = this->blob_bottom_->cpu_data();
   int max_ind;
   TypeParam max_val;
   int num = this->blob_bottom_->num();
@@ -152,10 +191,10 @@ TYPED_TEST(ArgMaxLayerTest, TestCPUMaxValTopK) {
     for (int j = 0; j < this->top_k_; ++j) {
       max_ind = this->blob_top_->data_at(i, 0, j, 0);
       max_val = this->blob_top_->data_at(i, 1, j, 0);
-      EXPECT_EQ(this->blob_bottom_->data_at(i, max_ind, 0, 0), max_val);
+      EXPECT_EQ(bottom_data[i * dim + max_ind], max_val);
       int count = 0;
       for (int k = 0; k < dim; ++k) {
-        if (this->blob_bottom_->data_at(i, k, 0, 0) > max_val) {
+        if (bottom_data[i * dim + k] > max_val) {
           ++count;
         }
       }
@@ -164,5 +203,93 @@ TYPED_TEST(ArgMaxLayerTest, TestCPUMaxValTopK) {
   }
 }
 
+TYPED_TEST(ArgMaxLayerTest, TestCPUAxis) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(0);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  int max_ind;
+  TypeParam max_val;
+  std::vector<int> shape = this->blob_bottom_->shape();
+  for (int i = 0; i < shape[1]; ++i) {
+    for (int j = 0; j < shape[2]; ++j) {
+      for (int k = 0; k < shape[3]; ++k) {
+        max_ind = this->blob_top_->data_at(0, i, j, k);
+        max_val = this->blob_bottom_->data_at(max_ind, i, j, k);
+        EXPECT_GE(max_ind, 0);
+        EXPECT_LE(max_ind, shape[0]);
+        for (int l = 0; l < shape[0]; ++l) {
+          EXPECT_LE(this->blob_bottom_->data_at(l, i, j, k), max_val);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ArgMaxLayerTest, TestCPUAxisTopK) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(2);
+  argmax_param->set_top_k(this->top_k_);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  int max_ind;
+  TypeParam max_val;
+  std::vector<int> shape = this->blob_bottom_->shape();
+  for (int i = 0; i < shape[0]; ++i) {
+    for (int j = 0; j < shape[1]; ++j) {
+      for (int k = 0; k < shape[3]; ++k) {
+        for (int m = 0; m < this->top_k_; ++m) {
+          max_ind = this->blob_top_->data_at(i, j, m, k);
+          max_val = this->blob_bottom_->data_at(i, j, max_ind, k);
+          EXPECT_GE(max_ind, 0);
+          EXPECT_LE(max_ind, shape[2]);
+          int count = 0;
+          for (int l = 0; l < shape[2]; ++l) {
+            if (this->blob_bottom_->data_at(i, j, l, k) > max_val) {
+              ++count;
+            }
+          }
+          EXPECT_EQ(m, count);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(ArgMaxLayerTest, TestCPUAxisMaxValTopK) {
+  LayerParameter layer_param;
+  ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param();
+  argmax_param->set_axis(-1);
+  argmax_param->set_top_k(this->top_k_);
+  argmax_param->set_out_max_val(true);
+  ArgMaxLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  TypeParam max_val;
+  std::vector<int> shape = this->blob_bottom_->shape();
+  for (int i = 0; i < shape[0]; ++i) {
+    for (int j = 0; j < shape[1]; ++j) {
+      for (int k = 0; k < shape[2]; ++k) {
+        for (int m = 0; m < this->top_k_; ++m) {
+          max_val = this->blob_top_->data_at(i, j, k, m);
+          int count = 0;
+          for (int l = 0; l < shape[3]; ++l) {
+            if (this->blob_bottom_->data_at(i, j, k, l) > max_val) {
+              ++count;
+            }
+          }
+          EXPECT_EQ(m, count);
+        }
+      }
+    }
+  }
+}
 
 }  // namespace caffe
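
For reference, the axis tests above pin down the output shape rule: the top blob keeps the bottom shape except that the chosen axis has extent top_k, and with out_max_val set the maxima themselves are written instead of their indices. A prototxt fragment using the new parameter might look like the following; the layer and blob names are made up for illustration:

    layer {
      name: "argmax"                 # hypothetical layer name
      type: "ArgMax"
      bottom: "prob"                 # hypothetical input blob
      top: "argmax"
      argmax_param { axis: 1 top_k: 5 }
    }
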
diff --git a/src/caffe/test/test_batch_reindex_layer.cpp b/src/caffe/test/test_batch_reindex_layer.cpp
new file mode 100644 (file)
index 0000000..985db34
--- /dev/null
@@ -0,0 +1,119 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template<typename TypeParam>
+class BatchReindexLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  BatchReindexLayerTest()
+      : blob_bottom_(new Blob<Dtype>()),
+        blob_bottom_permute_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()) {
+  }
+  virtual void SetUp() {
+    Caffe::set_random_seed(1701);
+    vector<int> sz;
+    sz.push_back(5);
+    sz.push_back(4);
+    sz.push_back(3);
+    sz.push_back(2);
+    blob_bottom_->Reshape(sz);
+    vector<int> permsz;
+    permsz.push_back(6);
+    blob_bottom_permute_->Reshape(permsz);
+
+    // fill the values
+    FillerParameter filler_param;
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    int perm[] = { 4, 0, 4, 0, 1, 2 };
+    for (int i = 0; i < blob_bottom_permute_->count(); ++i) {
+      blob_bottom_permute_->mutable_cpu_data()[i] = perm[i];
+    }
+
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_permute_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~BatchReindexLayerTest() {
+    delete blob_bottom_permute_;
+    delete blob_bottom_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_permute_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+
+  void TestForward() {
+    LayerParameter layer_param;
+
+    vector<int> sz;
+    sz.push_back(5);
+    sz.push_back(4);
+    sz.push_back(3);
+    sz.push_back(2);
+    blob_bottom_->Reshape(sz);
+    for (int i = 0; i < blob_bottom_->count(); ++i) {
+      blob_bottom_->mutable_cpu_data()[i] = i;
+    }
+
+    vector<int> permsz;
+    permsz.push_back(6);
+    blob_bottom_permute_->Reshape(permsz);
+    int perm[] = { 4, 0, 4, 0, 1, 2 };
+    for (int i = 0; i < blob_bottom_permute_->count(); ++i) {
+      blob_bottom_permute_->mutable_cpu_data()[i] = perm[i];
+    }
+    BatchReindexLayer<Dtype> layer(layer_param);
+    layer.SetUp(blob_bottom_vec_, blob_top_vec_);
+    EXPECT_EQ(blob_top_->num(), blob_bottom_permute_->num());
+    EXPECT_EQ(blob_top_->channels(), blob_bottom_->channels());
+    EXPECT_EQ(blob_top_->height(), blob_bottom_->height());
+    EXPECT_EQ(blob_top_->width(), blob_bottom_->width());
+
+    layer.Forward(blob_bottom_vec_, blob_top_vec_);
+    int channels = blob_top_->channels();
+    int height = blob_top_->height();
+    int width = blob_top_->width();
+    for (int i = 0; i < blob_top_->count(); ++i) {
+      int n = i / (channels * width * height);
+      int inner_idx = (i % (channels * width * height));
+      EXPECT_EQ(
+          blob_top_->cpu_data()[i],
+          blob_bottom_->cpu_data()[perm[n] * channels * width * height
+              + inner_idx]);
+    }
+  }
+};
+
+TYPED_TEST_CASE(BatchReindexLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(BatchReindexLayerTest, TestForward) {
+  this->TestForward();
+}
+
+TYPED_TEST(BatchReindexLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BatchReindexLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-4, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+}
+
+}  // namespace caffe
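
Concretely, TestForward above checks a pure gather over the batch axis: with a bottom batch of 5 items and an index blob holding { 4, 0, 4, 0, 1, 2 }, the top holds 6 items and top[n] is a copy of bottom[perm[n]], so batch items may be repeated or dropped. A minimal sketch of that gather on flattened data, assuming the row-major N x inner layout used in the test (illustration only, not part of the patch):

    #include <vector>

    // The property TestForward verifies: out[n] = in[perm[n]], item by item.
    std::vector<float> batch_reindex_sketch(const std::vector<float>& in,
                                            const std::vector<int>& perm,
                                            int inner) {  // e.g. 4 * 3 * 2
      std::vector<float> out(perm.size() * inner);
      for (size_t n = 0; n < perm.size(); ++n) {
        for (int k = 0; k < inner; ++k) {
          out[n * inner + k] = in[perm[n] * inner + k];
        }
      }
      return out;
    }
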
index 7ad7467..84c6747 100644 (file)
@@ -10,7 +10,7 @@
 #include "caffe/common.hpp"
 #include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
-#include "caffe/solver.hpp"
+#include "caffe/sgd_solvers.hpp"
 #include "caffe/util/io.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
@@ -47,7 +47,6 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   // Test data: check out generate_sample_data.py in the same directory.
   string* input_file_;
 
-  virtual SolverParameter_SolverType solver_type() = 0;
   virtual void InitSolver(const SolverParameter& param) = 0;
 
   virtual void InitSolverFromProtoString(const string& proto) {
@@ -290,8 +289,8 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
           ((i == D) ? bias.cpu_data()[0] : weights.cpu_data()[i]);
       // Finally, compute update.
       const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
-      if (solver_type() != SolverParameter_SolverType_ADADELTA
-          && solver_type() != SolverParameter_SolverType_ADAM) {
+      if (solver_->type() != string("AdaDelta")
+          && solver_->type() != string("Adam")) {
         ASSERT_EQ(2, history.size());  // 1 blob for weights, 1 for bias
       } else {
         ASSERT_EQ(4, history.size());  // additional blobs for update history
@@ -300,26 +299,19 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
       const Dtype history_value = (i == D) ?
             history[1]->cpu_data()[0] : history[0]->cpu_data()[i];
       const Dtype temp = momentum * history_value;
-      switch (solver_type()) {
-      case SolverParameter_SolverType_SGD:
+      if (solver_->type() == string("SGD")) {
         update_value += temp;
-        break;
-      case SolverParameter_SolverType_NESTEROV:
+      } else if (solver_->type() == string("Nesterov")) {
         update_value += temp;
         // step back then over-step
         update_value = (1 + momentum) * update_value - temp;
-        break;
-      case SolverParameter_SolverType_ADAGRAD:
+      } else if (solver_->type() == string("AdaGrad")) {
         update_value /= std::sqrt(history_value + grad * grad) + delta_;
-        break;
-      case SolverParameter_SolverType_RMSPROP: {
+      } else if (solver_->type() == string("RMSProp")) {
         const Dtype rms_decay = 0.95;
         update_value /= std::sqrt(rms_decay*history_value
             + grad * grad * (1 - rms_decay)) + delta_;
-        }
-        break;
-      case SolverParameter_SolverType_ADADELTA:
-      {
+      } else if (solver_->type() == string("AdaDelta")) {
         const Dtype update_history_value = (i == D) ?
             history[1 + num_param_blobs]->cpu_data()[0] :
             history[0 + num_param_blobs]->cpu_data()[i];
@@ -330,9 +322,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
         // not actually needed, just here for illustrative purposes
         // const Dtype weighted_update_average =
         //   momentum * update_history_value + (1 - momentum) * (update_value);
-        break;
-      }
-      case SolverParameter_SolverType_ADAM: {
+      } else if (solver_->type() == string("Adam")) {
         const Dtype momentum2 = 0.999;
         const Dtype m = history_value;
         const Dtype v = (i == D) ?
@@ -344,10 +334,8 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
             std::sqrt(Dtype(1) - pow(momentum2, num_iters)) /
             (Dtype(1.) - pow(momentum, num_iters));
         update_value = alpha_t * val_m / (std::sqrt(val_v) + delta_);
-        break;
-      }
-      default:
-        LOG(FATAL) << "Unknown solver type: " << solver_type();
+      } else {
+        LOG(FATAL) << "Unknown solver type: " << solver_->type();
       }
       if (i == D) {
         updated_bias.mutable_cpu_diff()[0] = update_value;
@@ -392,7 +380,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     EXPECT_NEAR(expected_updated_bias, solver_updated_bias, error_margin);
 
     // Check the solver's history -- should contain the previous update value.
-    if (solver_type() == SolverParameter_SolverType_SGD) {
+    if (solver_->type() == string("SGD")) {
       const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
       ASSERT_EQ(2, history.size());
       for (int i = 0; i < D; ++i) {
@@ -581,10 +569,6 @@ class SGDSolverTest : public GradientBasedSolverTest<TypeParam> {
   virtual void InitSolver(const SolverParameter& param) {
     this->solver_.reset(new SGDSolver<Dtype>(param));
   }
-
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_SGD;
-  }
 };
 
 TYPED_TEST_CASE(SGDSolverTest, TestDtypesAndDevices);
@@ -721,9 +705,6 @@ class AdaGradSolverTest : public GradientBasedSolverTest<TypeParam> {
   virtual void InitSolver(const SolverParameter& param) {
     this->solver_.reset(new AdaGradSolver<Dtype>(param));
   }
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_ADAGRAD;
-  }
 };
 
 TYPED_TEST_CASE(AdaGradSolverTest, TestDtypesAndDevices);
@@ -824,9 +805,6 @@ class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
   virtual void InitSolver(const SolverParameter& param) {
     this->solver_.reset(new NesterovSolver<Dtype>(param));
   }
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_NESTEROV;
-  }
 };
 
 TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices);
@@ -960,10 +938,6 @@ class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
   virtual void InitSolver(const SolverParameter& param) {
     this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
   }
-
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_ADADELTA;
-  }
 };
 
 TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
@@ -1098,9 +1072,6 @@ class AdamSolverTest : public GradientBasedSolverTest<TypeParam> {
     new_param.set_momentum2(momentum2);
     this->solver_.reset(new AdamSolver<Dtype>(new_param));
   }
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_ADAM;
-  }
 };
 
 TYPED_TEST_CASE(AdamSolverTest, TestDtypesAndDevices);
@@ -1201,9 +1172,6 @@ class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
     new_param.set_rms_decay(rms_decay);
     this->solver_.reset(new RMSPropSolver<Dtype>(new_param));
   }
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_RMSPROP;
-  }
 };
 
 TYPED_TEST_CASE(RMSPropSolverTest, TestDtypesAndDevices);
index c4e2f8e..78cf2d9 100644 (file)
@@ -246,5 +246,201 @@ TYPED_TEST(LRNLayerTest, TestGradientWithinChannel) {
       this->blob_top_vec_);
 }
 
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNLRNLayerTest : public GPUDeviceTest<Dtype> {
+ protected:
+  CuDNNLRNLayerTest()
+      : epsilon_(Dtype(1e-5)),
+        blob_bottom_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    Caffe::set_random_seed(1701);
+    blob_bottom_->Reshape(2, 7, 3, 3);
+    // fill the values
+    FillerParameter filler_param;
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~CuDNNLRNLayerTest() { delete blob_bottom_; delete blob_top_; }
+  void ReferenceLRNForward(const Blob<Dtype>& blob_bottom,
+      const LayerParameter& layer_param, Blob<Dtype>* blob_top);
+
+  Dtype epsilon_;
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+template <typename TypeParam>
+void CuDNNLRNLayerTest<TypeParam>::ReferenceLRNForward(
+    const Blob<TypeParam>& blob_bottom, const LayerParameter& layer_param,
+    Blob<TypeParam>* blob_top) {
+  typedef TypeParam Dtype;
+  blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(),
+      blob_bottom.height(), blob_bottom.width());
+  Dtype* top_data = blob_top->mutable_cpu_data();
+  LRNParameter lrn_param = layer_param.lrn_param();
+  Dtype alpha = lrn_param.alpha();
+  Dtype beta = lrn_param.beta();
+  int size = lrn_param.local_size();
+  switch (lrn_param.norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    for (int n = 0; n < blob_bottom.num(); ++n) {
+      for (int c = 0; c < blob_bottom.channels(); ++c) {
+        for (int h = 0; h < blob_bottom.height(); ++h) {
+          for (int w = 0; w < blob_bottom.width(); ++w) {
+            int c_start = c - (size - 1) / 2;
+            int c_end = min(c_start + size, blob_bottom.channels());
+            c_start = max(c_start, 0);
+            Dtype scale = 1.;
+            for (int i = c_start; i < c_end; ++i) {
+              Dtype value = blob_bottom.data_at(n, i, h, w);
+              scale += value * value * alpha / size;
+            }
+            *(top_data + blob_top->offset(n, c, h, w)) =
+              blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
+          }
+        }
+      }
+    }
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    for (int n = 0; n < blob_bottom.num(); ++n) {
+      for (int c = 0; c < blob_bottom.channels(); ++c) {
+        for (int h = 0; h < blob_bottom.height(); ++h) {
+          int h_start = h - (size - 1) / 2;
+          int h_end = min(h_start + size, blob_bottom.height());
+          h_start = max(h_start, 0);
+          for (int w = 0; w < blob_bottom.width(); ++w) {
+            Dtype scale = 1.;
+            int w_start = w - (size - 1) / 2;
+            int w_end = min(w_start + size, blob_bottom.width());
+            w_start = max(w_start, 0);
+            for (int nh = h_start; nh < h_end; ++nh) {
+              for (int nw = w_start; nw < w_end; ++nw) {
+                Dtype value = blob_bottom.data_at(n, c, nh, nw);
+                scale += value * value * alpha / (size * size);
+              }
+            }
+            *(top_data + blob_top->offset(n, c, h, w)) =
+              blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
+          }
+        }
+      }
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
+}
+
+TYPED_TEST_CASE(CuDNNLRNLayerTest, TestDtypes);
+
+TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsCuDNN) {
+  // typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CuDNNLRNLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  Blob<TypeParam> top_reference;
+  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+      &top_reference);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+                this->epsilon_);
+  }
+}
+
+TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsLargeRegionCuDNN) {
+  typedef TypeParam Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_lrn_param()->set_local_size(15);
+  CuDNNLRNLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  Blob<Dtype> top_reference;
+  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+      &top_reference);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+                this->epsilon_);
+  }
+}
+
+TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsCuDNN) {
+  typedef TypeParam Dtype;
+  LayerParameter layer_param;
+  CuDNNLRNLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = 1.;
+  }
+  vector<bool> propagate_down(this->blob_bottom_vec_.size(), true);
+  layer.Backward(this->blob_top_vec_, propagate_down,
+                 this->blob_bottom_vec_);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(CuDNNLRNLayerTest, TestForwardWithinChannel) {
+  typedef TypeParam Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_lrn_param()->set_norm_region(
+      LRNParameter_NormRegion_WITHIN_CHANNEL);
+  layer_param.mutable_lrn_param()->set_local_size(3);
+  CuDNNLCNLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  Blob<Dtype> top_reference;
+  this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+      &top_reference);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+                this->epsilon_);
+  }
+}
+
+TYPED_TEST(CuDNNLRNLayerTest, TestGradientWithinChannel) {
+  typedef TypeParam Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_lrn_param()->set_norm_region(
+      LRNParameter_NormRegion_WITHIN_CHANNEL);
+  layer_param.mutable_lrn_param()->set_local_size(3);
+  CuDNNLCNLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = 1.;
+  }
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsLargeRegionCuDNN) {
+  typedef TypeParam Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_lrn_param()->set_local_size(15);
+  CuDNNLRNLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    this->blob_top_->mutable_cpu_diff()[i] = 1.;
+  }
+  vector<bool> propagate_down(this->blob_bottom_vec_.size(), true);
+  layer.Backward(this->blob_top_vec_, propagate_down,
+                 this->blob_bottom_vec_);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+#endif
 
 }  // namespace caffe
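
ReferenceLRNForward above doubles as a statement of the normalization the CuDNN layers are checked against. In the ACROSS_CHANNELS case each output is

    top(n, c, h, w) = bottom(n, c, h, w)
                      / (1 + (alpha / local_size) * sum_i bottom(n, i, h, w)^2) ^ beta

with the sum taken over the local_size channels centred on c, clipped at the channel boundaries. The WITHIN_CHANNEL case exercised by the CuDNNLCNLayer tests applies the same scaling over a local_size x local_size spatial window, with the sum weighted by alpha / local_size^2.
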
index ceabc9c..b181642 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
+#include "caffe/sgd_solvers.hpp"
 #include "caffe/solver.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
diff --git a/src/caffe/test/test_solver_factory.cpp b/src/caffe/test/test_solver_factory.cpp
new file mode 100644 (file)
index 0000000..eef5290
--- /dev/null
@@ -0,0 +1,50 @@
+#include <map>
+#include <string>
+
+#include "boost/scoped_ptr.hpp"
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+
+#include "caffe/common.hpp"
+#include "caffe/solver.hpp"
+#include "caffe/solver_factory.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class SolverFactoryTest : public MultiDeviceTest<TypeParam> {
+ protected:
+  SolverParameter simple_solver_param() {
+    const string solver_proto =
+        "train_net_param { "
+        "  layer { "
+        "    name: 'data' type: 'DummyData' top: 'data' "
+        "    dummy_data_param { shape { dim: 1 } } "
+        "  } "
+        "} ";
+    SolverParameter solver_param;
+    CHECK(google::protobuf::TextFormat::ParseFromString(
+        solver_proto, &solver_param));
+    return solver_param;
+  }
+};
+
+TYPED_TEST_CASE(SolverFactoryTest, TestDtypesAndDevices);
+
+TYPED_TEST(SolverFactoryTest, TestCreateSolver) {
+  typedef typename TypeParam::Dtype Dtype;
+  typename SolverRegistry<Dtype>::CreatorRegistry& registry =
+      SolverRegistry<Dtype>::Registry();
+  shared_ptr<Solver<Dtype> > solver;
+  SolverParameter solver_param = this->simple_solver_param();
+  for (typename SolverRegistry<Dtype>::CreatorRegistry::iterator iter =
+       registry.begin(); iter != registry.end(); ++iter) {
+    solver_param.set_type(iter->first);
+    solver.reset(SolverRegistry<Dtype>::CreateSolver(solver_param));
+    EXPECT_EQ(iter->first, solver->type());
+  }
+}
+
+}  // namespace caffe
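
The loop in TestCreateSolver is also the intended usage pattern: any solver registered via REGISTER_SOLVER_CLASS can be constructed from its type string alone. A minimal sketch, assuming only the headers the test itself includes (the helper name is hypothetical):

    #include <string>
    #include "caffe/solver.hpp"
    #include "caffe/solver_factory.hpp"

    // Construct a registered solver ("SGD", "Adam", "RMSProp", ...) by setting
    // the type string on a SolverParameter and asking the registry for it.
    caffe::Solver<float>* make_solver(caffe::SolverParameter param,
                                      const std::string& type) {
      param.set_type(type);  // any key present in SolverRegistry<float>::Registry()
      // The returned solver reports the same string from its type() method.
      return caffe::SolverRegistry<float>::CreateSolver(param);
    }
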
index ee05b15..23deddd 100644 (file)
@@ -2892,7 +2892,6 @@ TEST_F(NetUpgradeTest, TestImageNet) {
   this->RunV1UpgradeTest(expected_v1_proto, expected_v2_proto);
 }  // NOLINT(readability/fn_size)
 
-#ifdef USE_OPENCV
 TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
   LayerParameter layer_param;
   shared_ptr<Layer<float> > layer;
@@ -2927,5 +2926,65 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
     EXPECT_EQ(v2_layer_type, layer->type());
   }
 }
-#endif  // USE_OPENCV
+
+class SolverTypeUpgradeTest : public ::testing::Test {
+ protected:
+  void RunSolverTypeUpgradeTest(
+      const string& input_param_string, const string& output_param_string) {
+    // Test upgrading old solver_type field (enum) to new type field (string)
+    SolverParameter input_param;
+    CHECK(google::protobuf::TextFormat::ParseFromString(
+        input_param_string, &input_param));
+    SolverParameter expected_output_param;
+    CHECK(google::protobuf::TextFormat::ParseFromString(
+        output_param_string, &expected_output_param));
+    SolverParameter actual_output_param = input_param;
+    UpgradeSolverType(&actual_output_param);
+    EXPECT_EQ(expected_output_param.DebugString(),
+        actual_output_param.DebugString());
+  }
+};
+
+TEST_F(SolverTypeUpgradeTest, TestSimple) {
+  const char* old_type_vec[6] = { "SGD", "ADAGRAD", "NESTEROV", "RMSPROP",
+      "ADADELTA", "ADAM" };
+  const char* new_type_vec[6] = { "SGD", "AdaGrad", "Nesterov", "RMSProp",
+      "AdaDelta", "Adam" };
+  for (int i = 0; i < 6; ++i) {
+    const string& input_proto =
+        "net: 'examples/mnist/lenet_train_test.prototxt' "
+        "test_iter: 100 "
+        "test_interval: 500 "
+        "base_lr: 0.01 "
+        "momentum: 0.0 "
+        "weight_decay: 0.0005 "
+        "lr_policy: 'inv' "
+        "gamma: 0.0001 "
+        "power: 0.75 "
+        "display: 100 "
+        "max_iter: 10000 "
+        "snapshot: 5000 "
+        "snapshot_prefix: 'examples/mnist/lenet_rmsprop' "
+        "solver_mode: GPU "
+        "solver_type: " + std::string(old_type_vec[i]) + " ";
+    const string& expected_output_proto =
+        "net: 'examples/mnist/lenet_train_test.prototxt' "
+        "test_iter: 100 "
+        "test_interval: 500 "
+        "base_lr: 0.01 "
+        "momentum: 0.0 "
+        "weight_decay: 0.0005 "
+        "lr_policy: 'inv' "
+        "gamma: 0.0001 "
+        "power: 0.75 "
+        "display: 100 "
+        "max_iter: 10000 "
+        "snapshot: 5000 "
+        "snapshot_prefix: 'examples/mnist/lenet_rmsprop' "
+        "solver_mode: GPU "
+        "type: '" + std::string(new_type_vec[i]) + "' ";
+    this->RunSolverTypeUpgradeTest(input_proto, expected_output_proto);
+  }
+}
+
 }  // NOLINT(readability/fn_size)  // namespace caffe
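
In prototxt terms, the upgrade exercised by SolverTypeUpgradeTest is a one-field rewrite, for example:

    # deprecated enum form
    solver_type: ADAM
    # upgraded string form produced by UpgradeSolverType()
    type: "Adam"
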
index b0a7be5..09da23d 100644 (file)
@@ -14,22 +14,20 @@ void im2col_cpu(const Dtype* data_im, const int channels,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     Dtype* data_col) {
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int channels_col = channels * kernel_h * kernel_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % kernel_w;
-    int h_offset = (c / kernel_w) % kernel_h;
-    int c_im = c / kernel_h / kernel_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_col[(c * height_col + h) * width_col + w] =
-            data_im[(c_im * height + h_pad) * width + w_pad];
-        else
-          data_col[(c * height_col + h) * width_col + w] = 0;
+  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  const int channels_col = channels * kernel_h * kernel_w;
+  for (int c_col = 0; c_col < channels_col; ++c_col) {
+    int w_offset = c_col % kernel_w;
+    int h_offset = (c_col / kernel_w) % kernel_h;
+    int c_im = c_col / kernel_h / kernel_w;
+    for (int h_col = 0; h_col < height_col; ++h_col) {
+      for (int w_col = 0; w_col < width_col; ++w_col) {
+        int h_im = h_col * stride_h - pad_h + h_offset;
+        int w_im = w_col * stride_w - pad_w + w_offset;
+        data_col[(c_col * height_col + h_col) * width_col + w_col] =
+            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+            data_im[(c_im * height + h_im) * width + w_im] : 0;
       }
     }
   }
@@ -64,9 +62,9 @@ inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
   const int channels_col = col_shape[0];
   vector<int> d_offset(num_spatial_axes, 0);
   vector<int> d_iter(num_spatial_axes, 0);
-  for (int c = 0; c < channels_col; ++c) {
+  for (int c_col = 0; c_col < channels_col; ++c_col) {
     // Loop over spatial axes in reverse order to compute a per-axis offset.
-    int offset = c;
+    int offset = c_col;
     for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
       if (d_i < num_spatial_axes - 1) {
         offset /= kernel_shape[d_i + 1];
@@ -76,17 +74,17 @@ inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
     for (bool incremented = true; incremented; ) {
       // Loop over spatial axes in forward order to compute the indices in the
       // image and column, and whether the index lies in the padding.
-      int index_col = c;
-      int index_im = c / kernel_size;
+      int index_col = c_col;
+      int index_im = c_col / kernel_size;
       bool is_padding = false;
       for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
         const int d = d_iter[d_i];
-        const int d_pad = d * stride[d_i] - pad[d_i] + d_offset[d_i];
-        is_padding |= d_pad < 0 || d_pad >= im_shape[d_i + 1];
+        const int d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i];
+        is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
         index_col *= col_shape[d_i + 1];
         index_col += d;
         index_im *= im_shape[d_i + 1];
-        index_im += d_pad;
+        index_im += d_im;
       }
       if (im2col) {
         if (is_padding) {
@@ -139,25 +137,25 @@ template void im2col_nd_cpu<double>(const double* data_im,
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     Dtype* data_im) {
   caffe_set(height * width * channels, Dtype(0), data_im);
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int channels_col = channels * patch_h * patch_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % patch_w;
-    int h_offset = (c / patch_w) % patch_h;
-    int c_im = c / patch_h / patch_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_im[(c_im * height + h_pad) * width + w_pad] +=
-              data_col[(c * height_col + h) * width_col + w];
+  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  const int channels_col = channels * kernel_h * kernel_w;
+  for (int c_col = 0; c_col < channels_col; ++c_col) {
+    int w_offset = c_col % kernel_w;
+    int h_offset = (c_col / kernel_w) % kernel_h;
+    int c_im = c_col / kernel_h / kernel_w;
+    for (int h_col = 0; h_col < height_col; ++h_col) {
+      for (int w_col = 0; w_col < width_col; ++w_col) {
+        int h_im = h_col * stride_h - pad_h + h_offset;
+        int w_im = w_col * stride_w - pad_w + w_offset;
+        if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
+          data_im[(c_im * height + h_im) * width + w_im] +=
+              data_col[(c_col * height_col + h_col) * width_col + w_col];
       }
     }
   }
@@ -165,11 +163,11 @@ void col2im_cpu(const Dtype* data_col, const int channels,
 
 // Explicit instantiation
 template void col2im_cpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 
index 5a478ba..451097f 100644 (file)
@@ -16,22 +16,23 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
     const int height_col, const int width_col,
     Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
-    int w_out = index % width_col;
-    int h_index = index / width_col;
-    int h_out = h_index % height_col;
-    int channel_in = h_index / height_col;
-    int channel_out = channel_in * kernel_h * kernel_w;
-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
+    const int h_index = index / width_col;
+    const int h_col = h_index % height_col;
+    const int w_col = index % width_col;
+    const int c_im = h_index / height_col;
+    const int c_col = c_im * kernel_h * kernel_w;
+    const int h_offset = h_col * stride_h - pad_h;
+    const int w_offset = w_col * stride_w - pad_w;
     Dtype* data_col_ptr = data_col;
-    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
     const Dtype* data_im_ptr = data_im;
-    data_im_ptr += (channel_in * height + h_in) * width + w_in;
+    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
-        int h = h_in + i;
-        int w = w_in + j;
-        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+        int h_im = h_offset + i;
+        int w_im = w_offset + j;
+        *data_col_ptr =
+            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
             data_im_ptr[i * width + j] : 0;
         data_col_ptr += height_col * width_col;
       }
@@ -222,35 +223,39 @@ template void im2col_nd_gpu<double>(const double* data_im,
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
     const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
+    const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     const int height_col, const int width_col,
     Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
     Dtype val = 0;
-    int w = index % width + pad_w;
-    int h = (index / width) % height + pad_h;
-    int c = index / (width * height);
+    const int w_im = index % width + pad_w;
+    const int h_im = (index / width) % height + pad_h;
+    const int c_im = index / (width * height);
     // compute the start and end of the output
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
+    const int w_col_start =
+        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
+    const int w_col_end =
+        min(w_im / stride_w + 1, width_col);
+    const int h_col_start =
+        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
+    const int h_col_end =
+        min(h_im / stride_h + 1, height_col);
     /*
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
         // the col location: [c * width * height + h_out, w_out]
-        int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
-            + (w - w_col * stride_w);
+        int c_col = c_im * kernel_h * kernel_w
+            + (h_im - h_col * stride_h) * kernel_w + (w_im - w_col * stride_w);
         val += data_col[(c_col * height_col + h_col) * width_col + w_col];
       }
     }
     */
     // equivalent implementation
-    int offset =
-        (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+    int offset = (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im)
+        * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
     int coeff_w_col = (1 - stride_w * height_col * width_col);
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
@@ -263,18 +268,18 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im) {
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
   int num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
   // NOLINT_NEXT_LINE(whitespace/operators)
   col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      num_kernels, data_col, height, width, channels, kernel_h, kernel_w,
       pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_im);
   CUDA_POST_KERNEL_CHECK;
@@ -282,11 +287,11 @@ void col2im_gpu(const Dtype* data_col, const int channels,
 
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
+    const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 
@@ -302,11 +307,11 @@ __global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
   CUDA_KERNEL_LOOP(index, n) {
     // Initialize channel_in, computed in the loop below, with intermediate
     // computations used to compute the spatial indices.
-    int channel_im = index;
+    int c_im = index;
     // Calculate d_im (image dimensions).
     for (int i = num_axes - 1; i >= 0; --i) {
-      d_im[i] = channel_im % im_shape[i + 1] + pad[i];
-      channel_im /= im_shape[i + 1];
+      d_im[i] = c_im % im_shape[i + 1] + pad[i];
+      c_im /= im_shape[i + 1];
     }
     // Calculate col start/end indices.
     bool done = false;
@@ -338,7 +343,7 @@ __global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
             (d_im[i] - d_col_iter[i] * stride[i]) * kernel_shape_prod;
         kernel_shape_prod *= kernel_shape[i];
       }
-      final_offset += kernel_shape_prod * channel_im;
+      final_offset += kernel_shape_prod * c_im;
       for (int i = 0; i < num_axes; ++i) {
         final_offset *= col_shape[i + 1];
         final_offset += d_col_iter[i];
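
One detail worth spelling out for col2im_gpu_kernel above: the "equivalent implementation" is the commented-out loop with the column index expanded algebraically. Substituting

    c_col = c_im * kernel_h * kernel_w
            + (h_im - h_col * stride_h) * kernel_w + (w_im - w_col * stride_w)

into index = (c_col * height_col + h_col) * width_col + w_col and collecting the h_col and w_col terms gives

    index = (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im)
                * height_col * width_col                                    // offset
            + h_col * (1 - stride_h * kernel_w * height_col) * width_col    // coeff_h_col
            + w_col * (1 - stride_w * height_col * width_col)               // coeff_w_col

which is exactly the precomputed offset and the two coefficients, so each step of the double loop only needs offset + h_col * coeff_h_col + w_col * coeff_w_col.
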
index ac379e5..ff3f8ff 100644 (file)
@@ -16,6 +16,67 @@ bool NetNeedsUpgrade(const NetParameter& net_param) {
   return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param);
 }
 
+bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
+  bool success = true;
+  if (NetNeedsV0ToV1Upgrade(*param)) {
+    // NetParameter was specified using the old style (V0LayerParameter); try to
+    // upgrade it.
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "V0LayerParameter: " << param_file;
+    NetParameter original_param(*param);
+    if (!UpgradeV0Net(original_param, param)) {
+      success = false;
+      LOG(ERROR) << "Warning: had one or more problems upgrading "
+          << "V0NetParameter to NetParameter (see above); continuing anyway.";
+    } else {
+      LOG(INFO) << "Successfully upgraded file specified using deprecated "
+                << "V0LayerParameter";
+    }
+    LOG(WARNING) << "Note that future Caffe releases will not support "
+        << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
+        << "prototxt and ./build/tools/upgrade_net_proto_binary for model "
+        << "weights upgrade this and any other net protos to the new format.";
+  }
+  // NetParameter uses old style data transformation fields; try to upgrade it.
+  if (NetNeedsDataUpgrade(*param)) {
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "transformation parameters: " << param_file;
+    UpgradeNetDataTransformation(param);
+    LOG(INFO) << "Successfully upgraded file specified using deprecated "
+              << "data transformation parameters.";
+    LOG(WARNING) << "Note that future Caffe releases will only support "
+                 << "transform_param messages for transformation fields.";
+  }
+  if (NetNeedsV1ToV2Upgrade(*param)) {
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "V1LayerParameter: " << param_file;
+    NetParameter original_param(*param);
+    if (!UpgradeV1Net(original_param, param)) {
+      success = false;
+      LOG(ERROR) << "Warning: had one or more problems upgrading "
+                 << "V1LayerParameter (see above); continuing anyway.";
+    } else {
+      LOG(INFO) << "Successfully upgraded file specified using deprecated "
+                << "V1LayerParameter";
+    }
+  }
+  return success;
+}
+
+void ReadNetParamsFromTextFileOrDie(const string& param_file,
+                                    NetParameter* param) {
+  CHECK(ReadProtoFromTextFile(param_file, param))
+      << "Failed to parse NetParameter file: " << param_file;
+  UpgradeNetAsNeeded(param_file, param);
+}
+
+void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
+                                      NetParameter* param) {
+  CHECK(ReadProtoFromBinaryFile(param_file, param))
+      << "Failed to parse NetParameter file: " << param_file;
+  UpgradeNetAsNeeded(param_file, param);
+}
+
 bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param) {
   for (int i = 0; i < net_param.layers_size(); ++i) {
     if (net_param.layers(i).has_layer()) {
@@ -583,53 +644,6 @@ void UpgradeNetDataTransformation(NetParameter* net_param) {
   }
 }
 
-bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
-  bool success = true;
-  if (NetNeedsV0ToV1Upgrade(*param)) {
-    // NetParameter was specified using the old style (V0LayerParameter); try to
-    // upgrade it.
-    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
-              << "V0LayerParameter: " << param_file;
-    NetParameter original_param(*param);
-    if (!UpgradeV0Net(original_param, param)) {
-      success = false;
-      LOG(ERROR) << "Warning: had one or more problems upgrading "
-          << "V0NetParameter to NetParameter (see above); continuing anyway.";
-    } else {
-      LOG(INFO) << "Successfully upgraded file specified using deprecated "
-                << "V0LayerParameter";
-    }
-    LOG(WARNING) << "Note that future Caffe releases will not support "
-        << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
-        << "prototxt and ./build/tools/upgrade_net_proto_binary for model "
-        << "weights upgrade this and any other net protos to the new format.";
-  }
-  // NetParameter uses old style data transformation fields; try to upgrade it.
-  if (NetNeedsDataUpgrade(*param)) {
-    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
-              << "transformation parameters: " << param_file;
-    UpgradeNetDataTransformation(param);
-    LOG(INFO) << "Successfully upgraded file specified using deprecated "
-              << "data transformation parameters.";
-    LOG(WARNING) << "Note that future Caffe releases will only support "
-                 << "transform_param messages for transformation fields.";
-  }
-  if (NetNeedsV1ToV2Upgrade(*param)) {
-    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
-              << "V1LayerParameter: " << param_file;
-    NetParameter original_param(*param);
-    if (!UpgradeV1Net(original_param, param)) {
-      success = false;
-      LOG(ERROR) << "Warning: had one or more problems upgrading "
-                 << "V1LayerParameter (see above); continuing anyway.";
-    } else {
-      LOG(INFO) << "Successfully upgraded file specified using deprecated "
-                << "V1LayerParameter";
-    }
-  }
-  return success;
-}
-
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
   bool is_fully_compatible = true;
   if (v1_net_param.layer_size() > 0) {
@@ -923,18 +937,78 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
   }
 }
 
-void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param) {
-  CHECK(ReadProtoFromTextFile(param_file, param))
-      << "Failed to parse NetParameter file: " << param_file;
-  UpgradeNetAsNeeded(param_file, param);
+// Return true iff the solver specifies its type via the old solver_type enum.
+bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param) {
+  if (solver_param.has_solver_type()) {
+    return true;
+  }
+  return false;
 }
 
-void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param) {
-  CHECK(ReadProtoFromBinaryFile(param_file, param))
-      << "Failed to parse NetParameter file: " << param_file;
-  UpgradeNetAsNeeded(param_file, param);
+bool UpgradeSolverType(SolverParameter* solver_param) {
+  CHECK(!solver_param->has_solver_type() || !solver_param->has_type())
+      << "Failed to upgrade solver: old solver_type field (enum) and new type "
+      << "field (string) cannot be both specified in solver proto text.";
+  if (solver_param->has_solver_type()) {
+    string type;
+    switch (solver_param->solver_type()) {
+    case SolverParameter_SolverType_SGD:
+      type = "SGD";
+      break;
+    case SolverParameter_SolverType_NESTEROV:
+      type = "Nesterov";
+      break;
+    case SolverParameter_SolverType_ADAGRAD:
+      type = "AdaGrad";
+      break;
+    case SolverParameter_SolverType_RMSPROP:
+      type = "RMSProp";
+      break;
+    case SolverParameter_SolverType_ADADELTA:
+      type = "AdaDelta";
+      break;
+    case SolverParameter_SolverType_ADAM:
+      type = "Adam";
+      break;
+    default:
+      LOG(FATAL) << "Unknown SolverParameter solver_type: "
+                 << solver_param->solver_type();
+    }
+    solver_param->set_type(type);
+    solver_param->clear_solver_type();
+  } else {
+    LOG(ERROR) << "Warning: solver type already up to date. ";
+    return false;
+  }
+  return true;
+}
+
+// Check for deprecations and upgrade the SolverParameter as needed.
+bool UpgradeSolverAsNeeded(const string& param_file, SolverParameter* param) {
+  bool success = true;
+  // Try to upgrade the old solver_type enum field into the new string type.
+  if (SolverNeedsTypeUpgrade(*param)) {
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "'solver_type' field (enum)': " << param_file;
+    if (!UpgradeSolverType(param)) {
+      success = false;
+      LOG(ERROR) << "Warning: had one or more problems upgrading "
+                 << "SolverType (see above).";
+    } else {
+      LOG(INFO) << "Successfully upgraded file specified using deprecated "
+                << "'solver_type' field (enum) to 'type' field (string).";
+      LOG(WARNING) << "Note that future Caffe releases will only support "
+                   << "'type' field (string) for a solver's type.";
+    }
+  }
+  return success;
+}
+
+// Read parameters from a file into a SolverParameter proto message.
+void ReadSolverParamsFromTextFileOrDie(const string& param_file,
+                                       SolverParameter* param) {
+  CHECK(ReadProtoFromTextFile(param_file, param))
+      << "Failed to parse SolverParameter file: " << param_file;
+  UpgradeSolverAsNeeded(param_file, param);
 }
 
 }  // namespace caffe
index e3f684b..305cfc3 100644 (file)
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -157,7 +157,7 @@ int train() {
       "but not both.";
 
   caffe::SolverParameter solver_param;
-  caffe::ReadProtoFromTextFileOrDie(FLAGS_solver, &solver_param);
+  caffe::ReadSolverParamsFromTextFileOrDie(FLAGS_solver, &solver_param);
 
   // If the gpus flag is not provided, allow the mode and device to be set
   // in the solver prototxt.
@@ -194,7 +194,7 @@ int train() {
         GetRequestedAction(FLAGS_sighup_effect));
 
   shared_ptr<caffe::Solver<float> >
-    solver(caffe::GetSolver<float>(solver_param));
+      solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
 
   solver->SetActionFunction(signal_handler.GetActionFunction());
 
diff --git a/tools/upgrade_solver_proto_text.cpp b/tools/upgrade_solver_proto_text.cpp
new file mode 100644 (file)
index 0000000..7130232
--- /dev/null
+++ b/tools/upgrade_solver_proto_text.cpp
@@ -0,0 +1,50 @@
+// This is a script to upgrade old solver prototxts to the new format.
+// Usage:
+//    upgrade_solver_proto_text old_solver_proto_file_in solver_proto_file_out
+
+#include <cstring>
+#include <fstream>  // NOLINT(readability/streams)
+#include <iostream>  // NOLINT(readability/streams)
+#include <string>
+
+#include "caffe/caffe.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/upgrade_proto.hpp"
+
+using std::ofstream;
+
+using namespace caffe;  // NOLINT(build/namespaces)
+
+int main(int argc, char** argv) {
+  ::google::InitGoogleLogging(argv[0]);
+  if (argc != 3) {
+    LOG(ERROR) << "Usage: upgrade_solver_proto_text "
+        << "old_solver_proto_file_in solver_proto_file_out";
+    return 1;
+  }
+
+  SolverParameter solver_param;
+  string input_filename(argv[1]);
+  if (!ReadProtoFromTextFile(input_filename, &solver_param)) {
+    LOG(ERROR) << "Failed to parse input text file as SolverParameter: "
+               << input_filename;
+    return 2;
+  }
+  bool need_upgrade = SolverNeedsTypeUpgrade(solver_param);
+  bool success = true;
+  if (need_upgrade) {
+    success = UpgradeSolverAsNeeded(input_filename, &solver_param);
+    if (!success) {
+      LOG(ERROR) << "Encountered error(s) while upgrading prototxt; "
+                 << "see details above.";
+    }
+  } else {
+    LOG(ERROR) << "File already in latest proto format: " << input_filename;
+  }
+
+  // Save new format prototxt.
+  WriteProtoToTextFile(solver_param, argv[2]);
+
+  LOG(ERROR) << "Wrote upgraded SolverParameter text proto to " << argv[2];
+  return !success;
+}