From: Evan Shelhamer
Date: Fri, 22 May 2015 01:14:16 +0000 (-0700)
Subject: test equivalence of solving with accumulating gradients
X-Git-Tag: submit/tizen/20180823.020014~477^2~1
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=92ab737adad6d686ac75cdf934472f6a97b52fe7;p=platform%2Fupstream%2Fcaffeonacl.git

test equivalence of solving with accumulating gradients

Compare the parameters after solving with a given batch size against
solving with half the batch size and two iterations of gradient
accumulation; the two should be equivalent.

Note: the test net dummy data layer now makes constant data and random
gaussian targets. This ensures the standard and gradient accumulation
cases see the same data. Otherwise the difference in batch sizes causes
the random numbers to be drawn in different orders.
---

diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index eb2569c..c9135d6 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -23,7 +23,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   GradientBasedSolverTest() :
-      seed_(1701), num_(5), channels_(3), height_(10), width_(10) {}
+      seed_(1701), num_(4), channels_(3), height_(10), width_(10) {}
 
   shared_ptr<SGDSolver<Dtype> > solver_;
   int seed_;
@@ -56,19 +56,21 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   }
 
   void RunLeastSquaresSolver(const Dtype learning_rate,
-      const Dtype weight_decay, const Dtype momentum, const int num_iters) {
+      const Dtype weight_decay, const Dtype momentum, const int num_iters,
+      const int iter_size = 1) {
     ostringstream proto;
     proto <<
        "max_iter: " << num_iters << " "
        "base_lr: " << learning_rate << " "
        "lr_policy: 'fixed' "
+       "iter_size: " << iter_size << " "
        "net_param { "
        "  name: 'TestNetwork' "
        "  layer { "
        "    name: 'data' "
        "    type: 'DummyData' "
        "    dummy_data_param { "
-       "      num: " << num_ << " "
+       "      num: " << num_ / iter_size << " "
        "      channels: " << channels_ << " "
        "      height: " << height_ << " "
        "      width: " << width_ << " "
@@ -76,6 +78,10 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
        "      height: 1 "
        "      width: 1 "
        "      data_filler { "
+       "        type: 'constant' "
+       "        value: 1.0 "
+       "      } "
+       "      data_filler { "
        "        type: 'gaussian' "
        "        std: 1.0 "
        "      } "
@@ -270,6 +276,45 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
   }
 
+  void CheckAccumulation(const Dtype kLearningRate, const Dtype kWeightDecay,
+      const Dtype kMomentum, const int kNumIters, const int kIterSize) {
+    const double kPrecision = 1e-2;
+    const double kMinPrecision = 1e-7;
+    // Solve without accumulation and save parameters.
+    this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
+        kNumIters);
+    // Save parameters for comparison.
+    Net<Dtype>& net = *this->solver_->net();
+    const vector<shared_ptr<Blob<Dtype> > >& param_blobs =
+        net.layer_by_name("innerprod")->blobs();
+    vector<shared_ptr<Blob<Dtype> > > noaccum_params(param_blobs.size());
+    for (int i = 0; i < param_blobs.size(); ++i) {
+      noaccum_params[i].reset(new Blob<Dtype>());
+      noaccum_params[i]->CopyFrom(*param_blobs[i], false, true);
+    }
+    // Solve by equivalent accumulation of gradients over divided batches.
+    this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
+        kNumIters, kIterSize);
+    Net<Dtype>& net_accum = *this->solver_->net();
+    const vector<shared_ptr<Blob<Dtype> > >& accum_params =
+        net_accum.layer_by_name("innerprod")->blobs();
+    // Compare accumulated parameters against no accumulation standard.
+    const int D = this->channels_ * this->height_ * this->width_;
+    for (int i = 0; i < D; ++i) {
+      const Dtype expected_param = noaccum_params[0]->cpu_data()[i];
+      const Dtype accum_param = accum_params[0]->cpu_data()[i];
+      const Dtype error_margin = std::max(kMinPrecision, kPrecision *
+          std::min(fabs(expected_param), fabs(accum_param)));
+      EXPECT_NEAR(expected_param, accum_param, error_margin);
+    }
+    ASSERT_EQ(1, accum_params[1]->count());
+    const Dtype expected_bias = noaccum_params[1]->cpu_data()[0];
+    const Dtype accum_bias = accum_params[1]->cpu_data()[0];
+    const Dtype error_margin = std::max(kMinPrecision, kPrecision *
+        std::min(fabs(expected_bias), fabs(accum_bias)));
+    EXPECT_NEAR(expected_bias, accum_bias, error_margin);
+  }
+
   // Test that the correct update is computed for a regularized least squares
   // problem:
   //
@@ -372,6 +417,16 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) {
   }
 }
 
+TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
 
 template <typename TypeParam>
 class AdaGradSolverTest : public GradientBasedSolverTest<TypeParam> {
@@ -416,6 +471,16 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) {
   }
 }
 
+TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
 
 template <typename TypeParam>
 class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
@@ -482,4 +547,15 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
   }
 }
 
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
 }  // namespace caffe
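
The equivalence that CheckAccumulation exercises can be sketched outside of Caffe: for a loss averaged over the batch, summing the mean gradients of two half-batches and dividing by the number of accumulation steps reproduces the full-batch mean gradient, so solving with num_ = 4 should match solving with num_ = 2 and iter_size = 2. The standalone C++ sketch below is illustrative only; the one-parameter least-squares model, the data values, and all names are assumptions and not part of the patch or of Caffe's API.

// Standalone sketch (not Caffe code): the mean gradient over a full batch
// equals the accumulated mean gradients of two half-batches divided by the
// number of accumulation steps. All names here are illustrative.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Mean gradient d/dw of 0.5 * (w * x_i - y_i)^2 over examples [begin, end).
double MeanGradient(double w, const std::vector<double>& x,
                    const std::vector<double>& y, int begin, int end) {
  double grad = 0.0;
  for (int i = begin; i < end; ++i) {
    grad += (w * x[i] - y[i]) * x[i];
  }
  return grad / (end - begin);
}

int main() {
  const double w = 0.5;
  const std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
  const std::vector<double> y = {0.9, 2.1, 2.8, 4.2};

  // One step over the full batch of 4.
  const double full_batch = MeanGradient(w, x, y, 0, 4);

  // Two half-batches of 2, accumulated and then renormalized by iter_size,
  // analogous to halving num_ and setting iter_size: 2 in the solver proto.
  const int iter_size = 2;
  double accumulated = MeanGradient(w, x, y, 0, 2) + MeanGradient(w, x, y, 2, 4);
  accumulated /= iter_size;

  assert(std::fabs(full_batch - accumulated) < 1e-12);
  std::printf("full batch: %f  accumulated: %f\n", full_batch, accumulated);
  return 0;
}

Since both solver runs start from the same seed and, with the constant-data filler, see the same inputs, applying weight decay and momentum identically in each run should keep the two parameter trajectories within the EXPECT_NEAR margins used by the test.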