}
template <typename Dtype>
-void AdaDeltaSolver<Dtype>::PreSolve() {
+void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
// Add the extra history entries for AdaDelta after those from
// SGDSolver::PreSolve
- const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+ const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
for (int i = 0; i < net_params.size(); ++i) {
const vector<int>& shape = net_params[i]->shape();
this->history_.push_back(
}
template <typename Dtype>
-void AdaDeltaSolver<Dtype>::ComputeUpdateValue() {
- const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
- const vector<float>& net_params_weight_decay =
- this->net_->params_weight_decay();
+void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+ const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+ const vector<float>& net_params_lr = this->net_->params_lr();
Dtype delta = this->param_.delta();
Dtype momentum = this->param_.momentum();
- Dtype weight_decay = this->param_.weight_decay();
- string regularization_type = this->param_.regularization_type();
+ Dtype local_rate = rate * net_params_lr[param_id];
size_t update_history_offset = net_params.size();
switch (Caffe::mode()) {
- case Caffe::CPU:
- for (int param_id = 0; param_id < net_params.size(); ++param_id) {
- Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
- if (local_decay) {
- if (regularization_type == "L2") {
- // add weight decay
- caffe_axpy(net_params[param_id]->count(),
- local_decay,
- net_params[param_id]->cpu_data(),
- net_params[param_id]->mutable_cpu_diff());
- } else if (regularization_type == "L1") {
- caffe_cpu_sign(net_params[param_id]->count(),
- net_params[param_id]->cpu_data(),
- this->temp_[param_id]->mutable_cpu_data());
- caffe_axpy(net_params[param_id]->count(),
- local_decay,
- this->temp_[param_id]->cpu_data(),
- net_params[param_id]->mutable_cpu_diff());
- } else {
- LOG(FATAL) << "Unknown regularization type: " << regularization_type;
- }
- }
+ case Caffe::CPU: {
+ // compute square of gradient in update
+ caffe_powx(net_params[param_id]->count(),
+ net_params[param_id]->cpu_diff(), Dtype(2),
+ this->update_[param_id]->mutable_cpu_data());
- // compute square of gradient in update
- caffe_powx(net_params[param_id]->count(),
- net_params[param_id]->cpu_diff(), Dtype(2),
- this->update_[param_id]->mutable_cpu_data());
-
- // update history of gradients
- caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
- this->update_[param_id]->cpu_data(), momentum,
- this->history_[param_id]->mutable_cpu_data());
-
- // add delta to history to guard against dividing by zero later
- caffe_set(net_params[param_id]->count(), delta,
- this->temp_[param_id]->mutable_cpu_data());
-
- caffe_add(net_params[param_id]->count(),
- this->temp_[param_id]->cpu_data(),
- this->history_[update_history_offset + param_id]->cpu_data(),
- this->update_[param_id]->mutable_cpu_data());
-
- caffe_add(net_params[param_id]->count(),
- this->temp_[param_id]->cpu_data(),
- this->history_[param_id]->cpu_data(),
- this->temp_[param_id]->mutable_cpu_data());
-
- // divide history of updates by history of gradients
- caffe_div(net_params[param_id]->count(),
- this->update_[param_id]->cpu_data(),
- this->temp_[param_id]->cpu_data(),
- this->update_[param_id]->mutable_cpu_data());
-
- // jointly compute the RMS of both for update and gradient history
- caffe_powx(net_params[param_id]->count(),
- this->update_[param_id]->cpu_data(), Dtype(0.5),
- this->update_[param_id]->mutable_cpu_data());
-
- // compute the update
- caffe_mul(net_params[param_id]->count(),
- net_params[param_id]->cpu_diff(),
- this->update_[param_id]->cpu_data(),
- net_params[param_id]->mutable_cpu_diff());
-
- // compute square of update
- caffe_powx(net_params[param_id]->count(),
- net_params[param_id]->cpu_diff(), Dtype(2),
- this->update_[param_id]->mutable_cpu_data());
-
- // update history of updates
- caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
- this->update_[param_id]->cpu_data(), momentum,
- this->history_[update_history_offset + param_id]->mutable_cpu_data());
- }
+ // update history of gradients
+ caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+ this->update_[param_id]->cpu_data(), momentum,
+ this->history_[param_id]->mutable_cpu_data());
+
+ // add delta to history to guard against dividing by zero later
+ caffe_set(net_params[param_id]->count(), delta,
+ this->temp_[param_id]->mutable_cpu_data());
+
+ caffe_add(net_params[param_id]->count(),
+ this->temp_[param_id]->cpu_data(),
+ this->history_[update_history_offset + param_id]->cpu_data(),
+ this->update_[param_id]->mutable_cpu_data());
+
+ caffe_add(net_params[param_id]->count(),
+ this->temp_[param_id]->cpu_data(),
+ this->history_[param_id]->cpu_data(),
+ this->temp_[param_id]->mutable_cpu_data());
+
+ // divide history of updates by history of gradients
+ caffe_div(net_params[param_id]->count(),
+ this->update_[param_id]->cpu_data(),
+ this->temp_[param_id]->cpu_data(),
+ this->update_[param_id]->mutable_cpu_data());
+
+ // jointly compute the RMS of both for update and gradient history
+ caffe_powx(net_params[param_id]->count(),
+ this->update_[param_id]->cpu_data(), Dtype(0.5),
+ this->update_[param_id]->mutable_cpu_data());
+
+ // compute the update
+ caffe_mul(net_params[param_id]->count(),
+ net_params[param_id]->cpu_diff(),
+ this->update_[param_id]->cpu_data(),
+ net_params[param_id]->mutable_cpu_diff());
+
+ // compute square of update
+ caffe_powx(net_params[param_id]->count(),
+ net_params[param_id]->cpu_diff(), Dtype(2),
+ this->update_[param_id]->mutable_cpu_data());
+
+ // update history of updates
+ caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+ this->update_[param_id]->cpu_data(), momentum,
+ this->history_[update_history_offset + param_id]->mutable_cpu_data());
+
+ // apply learning rate
+ caffe_cpu_scale(net_params[param_id]->count(), local_rate,
+ net_params[param_id]->cpu_diff(),
+ net_params[param_id]->mutable_cpu_diff());
break;
- case Caffe::GPU:
+ }
+ case Caffe::GPU: {
#ifndef CPU_ONLY
- for (int param_id = 0; param_id < net_params.size(); ++param_id) {
- Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
- if (local_decay) {
- if (regularization_type == "L2") {
- // add weight decay
- caffe_gpu_axpy(net_params[param_id]->count(),
- local_decay,
- net_params[param_id]->gpu_data(),
- net_params[param_id]->mutable_gpu_diff());
- } else if (regularization_type == "L1") {
- caffe_gpu_sign(net_params[param_id]->count(),
- net_params[param_id]->gpu_data(),
- this->temp_[param_id]->mutable_gpu_data());
- caffe_gpu_axpy(net_params[param_id]->count(),
- local_decay,
- this->temp_[param_id]->gpu_data(),
- net_params[param_id]->mutable_gpu_diff());
- } else {
- LOG(FATAL) << "Unknown regularization type: " << regularization_type;
- }
- }
+ // compute square of gradient in update
+ caffe_gpu_powx(net_params[param_id]->count(),
+ net_params[param_id]->gpu_diff(), Dtype(2),
+ this->update_[param_id]->mutable_gpu_data());
- // compute square of gradient in update
- caffe_gpu_powx(net_params[param_id]->count(),
- net_params[param_id]->gpu_diff(), Dtype(2),
- this->update_[param_id]->mutable_gpu_data());
-
- // update history of gradients
- caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
- this->update_[param_id]->gpu_data(), momentum,
- this->history_[param_id]->mutable_gpu_data());
-
- // add delta to history to guard against dividing by zero later
- caffe_gpu_set(net_params[param_id]->count(), delta,
- this->temp_[param_id]->mutable_gpu_data());
-
- caffe_gpu_add(net_params[param_id]->count(),
- this->temp_[param_id]->gpu_data(),
- this->history_[update_history_offset + param_id]->gpu_data(),
- this->update_[param_id]->mutable_gpu_data());
-
- caffe_gpu_add(net_params[param_id]->count(),
- this->temp_[param_id]->gpu_data(),
- this->history_[param_id]->gpu_data(),
- this->temp_[param_id]->mutable_gpu_data());
-
- // divide history of updates by history of gradients
- caffe_gpu_div(net_params[param_id]->count(),
- this->update_[param_id]->gpu_data(),
- this->temp_[param_id]->gpu_data(),
- this->update_[param_id]->mutable_gpu_data());
-
- // jointly compute the RMS of both for update and gradient history
- caffe_gpu_powx(net_params[param_id]->count(),
- this->update_[param_id]->gpu_data(), Dtype(0.5),
- this->update_[param_id]->mutable_gpu_data());
-
- // compute the update and copy to net_diff
- caffe_gpu_mul(net_params[param_id]->count(),
- net_params[param_id]->gpu_diff(),
- this->update_[param_id]->gpu_data(),
- net_params[param_id]->mutable_gpu_diff());
-
- // compute square of update
- caffe_gpu_powx(net_params[param_id]->count(),
- net_params[param_id]->gpu_diff(), Dtype(2),
- this->update_[param_id]->mutable_gpu_data());
-
- // update history of updates
- caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
- this->update_[param_id]->gpu_data(), momentum,
- this->history_[update_history_offset + param_id]->mutable_gpu_data());
- }
+ // update history of gradients
+ caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+ this->update_[param_id]->gpu_data(), momentum,
+ this->history_[param_id]->mutable_gpu_data());
+
+ // add delta to history to guard against dividing by zero later
+ caffe_gpu_set(net_params[param_id]->count(), delta,
+ this->temp_[param_id]->mutable_gpu_data());
+
+ caffe_gpu_add(net_params[param_id]->count(),
+ this->temp_[param_id]->gpu_data(),
+ this->history_[update_history_offset + param_id]->gpu_data(),
+ this->update_[param_id]->mutable_gpu_data());
+
+ caffe_gpu_add(net_params[param_id]->count(),
+ this->temp_[param_id]->gpu_data(),
+ this->history_[param_id]->gpu_data(),
+ this->temp_[param_id]->mutable_gpu_data());
+
+ // divide history of updates by history of gradients
+ caffe_gpu_div(net_params[param_id]->count(),
+ this->update_[param_id]->gpu_data(),
+ this->temp_[param_id]->gpu_data(),
+ this->update_[param_id]->mutable_gpu_data());
+
+ // jointly compute the RMS of both for update and gradient history
+ caffe_gpu_powx(net_params[param_id]->count(),
+ this->update_[param_id]->gpu_data(), Dtype(0.5),
+ this->update_[param_id]->mutable_gpu_data());
+
+ // compute the update and copy to net_diff
+ caffe_gpu_mul(net_params[param_id]->count(),
+ net_params[param_id]->gpu_diff(),
+ this->update_[param_id]->gpu_data(),
+ net_params[param_id]->mutable_gpu_diff());
+
+ // compute square of update
+ caffe_gpu_powx(net_params[param_id]->count(),
+ net_params[param_id]->gpu_diff(), Dtype(2),
+ this->update_[param_id]->mutable_gpu_data());
+
+ // update history of updates
+ caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+ this->update_[param_id]->gpu_data(), momentum,
+ this->history_[update_history_offset + param_id]->mutable_gpu_data());
+
+ // apply learning rate
+ caffe_gpu_scale(net_params[param_id]->count(), local_rate,
+ net_params[param_id]->gpu_diff(),
+ net_params[param_id]->mutable_gpu_diff());
#else
NO_GPU;
#endif
break;
+ }
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
" bottom: 'targets' "
" } "
"} ";
- if (learning_rate != 0) {
- proto << "base_lr: " << learning_rate << " ";
- proto << "lr_policy: 'fixed' ";
- }
if (weight_decay != 0) {
proto << "weight_decay: " << weight_decay << " ";
}
}
template <typename TypeParam>
+class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;  // per-instantiation scalar type
+
+ protected:
+ virtual void InitSolver(const SolverParameter& param) {  // uses AdaDelta
+ this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
+ }
+
+ virtual SolverParameter_SolverType solver_type() {  // reports ADADELTA
+ return SolverParameter_SolverType_ADADELTA;
+ }
+};
+
+TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;  // only argument given; rest default
+ this->TestLeastSquaresUpdate(kLearningRate);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.5;  // non-zero: exercises weight decay
+ const Dtype kMomentum = 0.95;  // single check; no iteration argument
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+}
+
+// Checks the AdaDelta least-squares update with momentum 0.5 and weight decay
+// disabled, once per loop index in 0..kNumIters (inclusive).
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.0;
+ const Dtype kMomentum = 0.5;
+ const int kNumIters = 1;
+ for (int i = 0; i <= kNumIters; ++i) {
+ // Forward the loop index as the fourth argument, as the multi-iteration
+ // tests in this file do; without it every pass repeated the same check.
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+// Checks the AdaDelta least-squares update with momentum 0.95 and weight decay
+// disabled, once per loop index in 0..kNumIters (inclusive).
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.0;
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 1;
+ for (int i = 0; i <= kNumIters; ++i) {
+ // Forward the loop index as the fourth argument, as the multi-iteration
+ // tests in this file do; without it every pass repeated the same check.
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.0;  // decay disabled; momentum only
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // inclusive bound: i runs 0..4
+ for (int i = 0; i <= kNumIters; ++i) {
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;  // decay and momentum both non-zero
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // inclusive bound: i runs 0..4
+ for (int i = 0; i <= kNumIters; ++i) {
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+TYPED_TEST(AdaDeltaSolverTest,
+ TestAdaDeltaLeastSquaresUpdateWithEverythingShare) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;  // decay and momentum both non-zero
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // inclusive bound: i runs 0..4
+ this->share_ = true;  // fixture flag, set before any check; see base class
+ for (int i = 0; i <= kNumIters; ++i) {
+ this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // passed through as-is; no loop in this test
+ const int kIterSize = 2;  // final CheckAccumulation argument
+ this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+ kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // passed through as-is; no loop in this test
+ const int kIterSize = 2;  // final CheckAccumulation argument
+ this->share_ = true;  // fixture flag, set before the check; see base class
+ this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+ kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshot) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // inclusive upper bound of the loop below
+ for (int i = 1; i <= kNumIters; ++i) {  // i runs 1..4; 0 is not used
+ this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshotShare) {
+ typedef typename TypeParam::Dtype Dtype;
+ const Dtype kLearningRate = 1.0;
+ const Dtype kWeightDecay = 0.1;
+ const Dtype kMomentum = 0.95;
+ const int kNumIters = 4;  // inclusive upper bound of the loop below
+ this->share_ = true;  // fixture flag, set before any check; see base class
+ for (int i = 1; i <= kNumIters; ++i) {  // i runs 1..4; 0 is not used
+ this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+ }
+}
+
+template <typename TypeParam>
class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
}
}
-template <typename TypeParam>
-class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
- typedef typename TypeParam::Dtype Dtype;
-
- protected:
- virtual void InitSolver(const SolverParameter& param) {
- this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
- }
-
- virtual SolverParameter_SolverType solver_type() {
- return SolverParameter_SolverType_ADADELTA;
- }
-};
-
-TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- this->TestLeastSquaresUpdate(kLearningRate);
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- const Dtype kWeightDecay = 0.5;
- const Dtype kMomentum = 0.95;
- this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- const Dtype kWeightDecay = 0.0;
- const Dtype kMomentum = 0.5;
- const int kNumIters = 1;
- for (int i = 0; i <= kNumIters; ++i) {
- this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
- }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- const Dtype kWeightDecay = 0.0;
- const Dtype kMomentum = 0.95;
- const int kNumIters = 1;
- for (int i = 0; i <= kNumIters; ++i) {
- this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
- }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- const Dtype kWeightDecay = 0.0;
- const Dtype kMomentum = 0.95;
- const int kNumIters = 4;
- for (int i = 0; i <= kNumIters; ++i) {
- this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
- }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) {
- typedef typename TypeParam::Dtype Dtype;
- const Dtype kLearningRate = 0.0;
- const Dtype kWeightDecay = 0.1;
- const Dtype kMomentum = 0.95;
- const int kNumIters = 4;
- for (int i = 0; i <= kNumIters; ++i) {
- this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
- }
-}
-
} // namespace caffe