include/caffe/parallel.hpp

   1 #ifndef CAFFE_PARALLEL_HPP_
   2 #define CAFFE_PARALLEL_HPP_
   3
   4 #ifdef USE_NCCL
   5
   6 #include <boost/thread.hpp>
   7
   8 #include <string>
   9 #include <vector>
  10
  11 #include "caffe/blob.hpp"
  12 #include "caffe/common.hpp"
  13 #include "caffe/internal_thread.hpp"
  14 #include "caffe/layer.hpp"
  15 #include "caffe/proto/caffe.pb.h"
  16 #include "caffe/solver.hpp"
  17 #include "caffe/syncedmem.hpp"
  18 #include "caffe/util/blocking_queue.hpp"
  19 #include "caffe/util/nccl.hpp"
  20
  21 namespace caffe {
  22
  23 // Represents a net parameters. Once a net is created, its parameter buffers can
  24 // be replaced by ones from Params, to allow parallelization. Params ensures
  25 // parameters are allocated in one consecutive array.
  26 template<typename Dtype>
  27 class Params {
  28  public:
  29   explicit Params(shared_ptr<Solver<Dtype> > root_solver);
  30   virtual ~Params() {
  31   }
  32
  33   inline size_t size() const {
  34     return size_;
  35   }
  36   inline Dtype* data() const {
  37     return data_;
  38   }
  39   inline Dtype* diff() const {
  40     return diff_;
  41   }
  42
  43  protected:
  44   const size_t size_;           // Size of buffers
  45   Dtype* data_;                 // Network parameters
  46   Dtype* diff_;                 // Gradient
  47
  48 DISABLE_COPY_AND_ASSIGN(Params);
  49 };
  50
  51 // Params stored in GPU memory.
  52 template<typename Dtype>
  53 class GPUParams : public Params<Dtype> {
  54  public:
  55   GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device);
  56   virtual ~GPUParams();
  57
  58   void Configure(Solver<Dtype>* solver) const;
  59
  60  protected:
  61   using Params<Dtype>::size_;
  62   using Params<Dtype>::data_;
  63   using Params<Dtype>::diff_;
  64 };
  65
  66 template<typename Dtype>
  67 class NCCL : public GPUParams<Dtype>,
  68              public Solver<Dtype>::Callback,
  69              public Net<Dtype>::Callback {
  70  public:
  71   /**
  72    * Single process version.
  73    */
  74   explicit NCCL(shared_ptr<Solver<Dtype> > solver);
  75   /**
  76    * In multi-process settings, first create a NCCL id (new_uid), then
  77    * pass it to each process to create connected instances.
  78    */
  79   NCCL(shared_ptr<Solver<Dtype> > solver, const string& uid);
  80   ~NCCL();
  81
  82   boost::barrier* barrier();
  83   void set_barrier(boost::barrier* value);
  84
  85   /**
  86    * In single process settings, create instances without uids and
  87    * call this to connect them.
  88    */
  89   static void InitSingleProcess(vector<NCCL<Dtype>*>* nccls);
  90
  91   static string new_uid();
  92
  93   /**
  94    * Broadcast weights from rank 0 other solvers.
  95    */
  96   void Broadcast();
  97
  98   /**
  99    * Single process multi-GPU.
 100    */
 101   void Run(const vector<int>& gpus, const char* restore);
 102
 103  protected:
 104   void Init();
 105   void on_start() {}
 106   void run(int layer);  // Net callback
 107   void on_gradients_ready();
 108
 109   ncclComm_t comm_;
 110   cudaStream_t stream_;
 111
 112   shared_ptr<Solver<Dtype> > solver_;
 113   // Should not be necessary, https://github.com/NVIDIA/nccl/issues/37
 114   boost::barrier* barrier_;
 115   using Params<Dtype>::size_;
 116   using Params<Dtype>::data_;
 117   using Params<Dtype>::diff_;
 118 };
 119
 120 }  // namespace caffe
 121
 122 #endif  // USE_NCCL
 123 #endif  // header