From e05b0887aac245f38fb3d98936fc6e749c59c362 Mon Sep 17 00:00:00 2001
From: Yangqing Jia
Date: Mon, 14 Oct 2013 10:50:32 -0700
Subject: [PATCH] added custom learning rate for individual blobs

---
 src/caffe/net.cpp                                | 18 +++++--
 src/caffe/net.hpp                                |  7 ++-
 src/caffe/optimization/solver.cpp                | 16 +++---
 src/caffe/proto/caffe.proto                      |  3 ++
 src/caffe/test/test_solver_linear_regression.cpp | 65 +++++++++++++++++++++---
 src/programs/dump_network.cpp                    | 43 +++++++++++-----
 src/programs/imagenet.prototxt                   | 20 +++++++-
 7 files changed, 138 insertions(+), 34 deletions(-)

diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 8190d8e..ff1cca4 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -85,7 +85,7 @@ Net<Dtype>::Net(const NetParameter& param,
     net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
   }

-  LOG(ERROR) << "Setting up the layers.";
+  LOG(INFO) << "Setting up the layers.";
   for (int i = 0; i < layers_.size(); ++i) {
     LOG(INFO) << "Setting up " << layer_names_[i];
     layers_[i]->SetUp(bottom_vecs_[i], &top_vecs_[i]);
@@ -93,14 +93,26 @@ Net<Dtype>::Net(const NetParameter& param,
     for (int j = 0; j < layer_blobs.size(); ++j) {
       params_.push_back(layer_blobs[j]);
     }
+    // push the learning rate multipliers
+    if (layers_[i]->layer_param().blobs_lr_size()) {
+      CHECK_EQ(layers_[i]->layer_param().blobs_lr_size(), layer_blobs.size());
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        float local_lr = layers_[i]->layer_param().blobs_lr(j);
+        CHECK_GT(local_lr, 0.);
+        params_lr_.push_back(local_lr);
+      }
+    } else {
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        params_lr_.push_back(1.);
+      }
+    }
     for (int topid = 0; topid < top_vecs_[i].size(); ++topid) {
       LOG(INFO) << "Top shape: " << top_vecs_[i][topid]->channels() << " "
           << top_vecs_[i][topid]->height() << " "
           << top_vecs_[i][topid]->width();
     }
   }
-
-  LOG(ERROR) << "Network initialization done.";
+  LOG(INFO) << "Network initialization done.";
 }

 template <typename Dtype>
diff --git a/src/caffe/net.hpp b/src/caffe/net.hpp
index 24bef4b..c27442b 100644
--- a/src/caffe/net.hpp
+++ b/src/caffe/net.hpp
@@ -57,7 +57,9 @@ class Net {
   inline vector<vector<Blob<Dtype>*> >& bottom_vecs() { return bottom_vecs_; }
   inline vector<vector<Blob<Dtype>*> >& top_vecs() { return top_vecs_; }
   // returns the parameters
-  vector<shared_ptr<Blob<Dtype> > >& params() { return params_; }
+  inline vector<shared_ptr<Blob<Dtype> > >& params() { return params_; }
+  // returns the parameter learning rate multipliers
+  inline vector<float>& params_lr() { return params_lr_; }
   // Updates the network
   void Update();

@@ -82,7 +84,8 @@ class Net {
   string name_;
   // The parameters in the network.
   vector<shared_ptr<Blob<Dtype> > > params_;
-
+  // the learning rate multipliers
+  vector<float> params_lr_;
   DISABLE_COPY_AND_ASSIGN(Net);
 };

diff --git a/src/caffe/optimization/solver.cpp b/src/caffe/optimization/solver.cpp
index fb38237..b2a5760 100644
--- a/src/caffe/optimization/solver.cpp
+++ b/src/caffe/optimization/solver.cpp
@@ -17,7 +17,6 @@ using std::min;

 namespace caffe {

-
 template <typename Dtype>
 void Solver<Dtype>::Solve(Net<Dtype>* net) {
   net_ = net;
@@ -112,6 +111,7 @@ void SGDSolver<Dtype>::PreSolve() {
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue() {
   vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  vector<float>& net_params_lr = this->net_->params_lr();
   // get the learning rate
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
@@ -120,17 +120,19 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   Dtype momentum = this->param_.momentum();
   Dtype weight_decay = this->param_.weight_decay();
   // LOG(ERROR) << "rate:" << rate << " momentum:" << momentum
-  // << " weight_decay:" << weight_decay;
+  //     << " weight_decay:" << weight_decay;
   switch (Caffe::mode()) {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      caffe_axpby(net_params[param_id]->count(), rate,
+      Dtype local_rate = rate * net_params_lr[param_id];
+      caffe_axpby(net_params[param_id]->count(), local_rate,
           net_params[param_id]->cpu_diff(), momentum,
           history_[param_id]->mutable_cpu_data());
       if (weight_decay) {
         // add weight decay
-        caffe_axpy(net_params[param_id]->count(), weight_decay * rate,
+        caffe_axpy(net_params[param_id]->count(),
+            weight_decay * local_rate,
             net_params[param_id]->cpu_data(),
             history_[param_id]->mutable_cpu_data());
       }
@@ -143,12 +145,14 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::GPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      caffe_gpu_axpby(net_params[param_id]->count(), rate,
+      Dtype local_rate = rate * net_params_lr[param_id];
+      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
           net_params[param_id]->gpu_diff(), momentum,
           history_[param_id]->mutable_gpu_data());
       if (weight_decay) {
         // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(), weight_decay * rate,
+        caffe_gpu_axpy(net_params[param_id]->count(),
+            weight_decay * local_rate,
             net_params[param_id]->gpu_data(),
             history_[param_id]->mutable_gpu_data());
       }
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index afefcca..87f2c2c 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -71,6 +71,9 @@ message LayerParameter {

   // The blobs containing the numeric parameters of the layer
   repeated BlobProto blobs = 50;
+  // The ratio that is multiplied on the global learning rate. If you want to
+  // set the learning rate for one blob, you need to set it for all blobs.
+  repeated float blobs_lr = 51;
 }

 message LayerConnection {
diff --git a/src/caffe/test/test_solver_linear_regression.cpp b/src/caffe/test/test_solver_linear_regression.cpp
index fbd53f3..8fd504b 100644
--- a/src/caffe/test/test_solver_linear_regression.cpp
+++ b/src/caffe/test/test_solver_linear_regression.cpp
@@ -26,7 +26,7 @@ class SolverTest : public ::testing::Test {};
 typedef ::testing::Types<float, double> Dtypes;
 TYPED_TEST_CASE(SolverTest, Dtypes);

-TYPED_TEST(SolverTest, TestSolve) {
+TYPED_TEST(SolverTest, TestSolveGPU) {
   Caffe::set_mode(Caffe::GPU);

   NetParameter net_param;
@@ -41,10 +41,10 @@ TYPED_TEST(SolverTest, TestSolve) {
   EXPECT_EQ(caffe_net.blob_names().size(), 3);

   // Run the network without training.
-  LOG(ERROR) << "Performing Forward";
+  LOG(INFO) << "Performing Forward";
   caffe_net.Forward(bottom_vec);
-  LOG(ERROR) << "Performing Backward";
-  LOG(ERROR) << "Initial loss: " << caffe_net.Backward();
+  LOG(INFO) << "Performing Backward";
+  LOG(INFO) << "Initial loss: " << caffe_net.Backward();

   SolverParameter solver_param;
   solver_param.set_base_lr(0.1);
@@ -55,13 +55,62 @@
   solver_param.set_power(0.75);
   solver_param.set_momentum(0.9);

-  LOG(ERROR) << "Starting Optimization";
+  LOG(INFO) << "Starting Optimization";
   SGDSolver<TypeParam> solver(solver_param);
   solver.Solve(&caffe_net);
-  LOG(ERROR) << "Optimization Done.";
-  LOG(ERROR) << "Weight: " << caffe_net.params()[0]->cpu_data()[0] << ", "
+  LOG(INFO) << "Optimization Done.";
+  LOG(INFO) << "Weight: " << caffe_net.params()[0]->cpu_data()[0] << ", "
       << caffe_net.params()[0]->cpu_data()[1];
-  LOG(ERROR) << "Bias: " << caffe_net.params()[1]->cpu_data()[0];
+  LOG(INFO) << "Bias: " << caffe_net.params()[1]->cpu_data()[0];
+
+  EXPECT_GE(caffe_net.params()[0]->cpu_data()[0], 0.3);
+  EXPECT_LE(caffe_net.params()[0]->cpu_data()[0], 0.35);
+
+  EXPECT_GE(caffe_net.params()[0]->cpu_data()[1], 0.3);
+  EXPECT_LE(caffe_net.params()[0]->cpu_data()[1], 0.35);
+
+  EXPECT_GE(caffe_net.params()[1]->cpu_data()[0], -0.01);
+  EXPECT_LE(caffe_net.params()[1]->cpu_data()[0], 0.01);
+}
+
+
+
+TYPED_TEST(SolverTest, TestSolveCPU) {
+  Caffe::set_mode(Caffe::CPU);
+
+  NetParameter net_param;
+  ReadProtoFromTextFile("data/linear_regression.prototxt",
+      &net_param);
+  // check if things are right
+  EXPECT_EQ(net_param.layers_size(), 3);
+  EXPECT_EQ(net_param.input_size(), 0);
+  vector<Blob<TypeParam>*> bottom_vec;
+  Net<TypeParam> caffe_net(net_param, bottom_vec);
+  EXPECT_EQ(caffe_net.layer_names().size(), 3);
+  EXPECT_EQ(caffe_net.blob_names().size(), 3);
+
+  // Run the network without training.
+ LOG(INFO) << "Performing Forward"; + caffe_net.Forward(bottom_vec); + LOG(INFO) << "Performing Backward"; + LOG(INFO) << "Initial loss: " << caffe_net.Backward(); + + SolverParameter solver_param; + solver_param.set_base_lr(0.1); + solver_param.set_display(0); + solver_param.set_max_iter(100); + solver_param.set_lr_policy("inv"); + solver_param.set_gamma(1.); + solver_param.set_power(0.75); + solver_param.set_momentum(0.9); + + LOG(INFO) << "Starting Optimization"; + SGDSolver solver(solver_param); + solver.Solve(&caffe_net); + LOG(INFO) << "Optimization Done."; + LOG(INFO) << "Weight: " << caffe_net.params()[0]->cpu_data()[0] << ", " + << caffe_net.params()[0]->cpu_data()[1]; + LOG(INFO) << "Bias: " << caffe_net.params()[1]->cpu_data()[0]; EXPECT_GE(caffe_net.params()[0]->cpu_data()[0], 0.3); EXPECT_LE(caffe_net.params()[0]->cpu_data()[0], 0.35); diff --git a/src/programs/dump_network.cpp b/src/programs/dump_network.cpp index 3507100..8dd8b0d 100644 --- a/src/programs/dump_network.cpp +++ b/src/programs/dump_network.cpp @@ -4,7 +4,10 @@ // all the intermediate blobs produced by the net to individual binary // files stored in protobuffer binary formats. // Usage: -// dump_network input_net_param trained_net_param input_blob output_prefix +// dump_network input_net_param trained_net_param input_blob output_prefix 0/1 +// if input_net_param is 'none', we will directly load the network from +// trained_net_param. If the last argv is 1, we will do a forward-backward pass +// before dumping everyting, and also dump the who network. #include #include @@ -29,26 +32,41 @@ int main(int argc, char** argv) { NetParameter net_param; NetParameter trained_net_param; - ReadProtoFromTextFile(argv[1], &net_param); - ReadProtoFromBinaryFile(argv[2], &trained_net_param); - BlobProto input_blob_proto; - ReadProtoFromBinaryFile(argv[3], &input_blob_proto); - shared_ptr > input_blob(new Blob()); - input_blob->FromProto(input_blob_proto); + if (strcmp(argv[1], "none") == 0) { + // We directly load the net param from trained file + ReadProtoFromBinaryFile(argv[2], &net_param); + } else { + ReadProtoFromTextFile(argv[1], &net_param); + } + ReadProtoFromBinaryFile(argv[2], &trained_net_param); + vector* > input_vec; - input_vec.push_back(input_blob.get()); - // For implementational reasons, we need to first set up the net, and - // then copy the trained parameters. + if (strcmp(argv[3], "none") != 0) { + BlobProto input_blob_proto; + ReadProtoFromBinaryFile(argv[3], &input_blob_proto); + shared_ptr > input_blob(new Blob()); + input_blob->FromProto(input_blob_proto); + input_vec.push_back(input_blob.get()); + } + shared_ptr > caffe_net(new Net(net_param, input_vec)); caffe_net->CopyTrainedLayersFrom(trained_net_param); + string output_prefix(argv[4]); // Run the network without training. 
LOG(ERROR) << "Performing Forward"; caffe_net->Forward(input_vec); - + if (argc > 4 && strcmp(argv[4], "1")) { + LOG(ERROR) << "Performing Backward"; + caffe_net->Backward(); + // Dump the network + NetParameter output_net_param; + caffe_net->ToProto(&output_net_param, true); + WriteProtoToBinaryFile(output_net_param, output_prefix + output_net_param.name()); + } // Now, let's dump all the layers - string output_prefix(argv[4]); + const vector& blob_names = caffe_net->blob_names(); const vector > >& blobs = caffe_net->blobs(); for (int blobid = 0; blobid < caffe_net->blobs().size(); ++blobid) { @@ -59,6 +77,5 @@ int main(int argc, char** argv) { WriteProtoToBinaryFile(output_blob_proto, output_prefix + blob_names[blobid]); } - // Dump results. return 0; } diff --git a/src/programs/imagenet.prototxt b/src/programs/imagenet.prototxt index d5e9b92..53295de 100644 --- a/src/programs/imagenet.prototxt +++ b/src/programs/imagenet.prototxt @@ -4,10 +4,10 @@ layers { name: "data" type: "data" source: "/home/jiayq/caffe-train-leveldb" - batchsize: 96 + batchsize: 128 subtraction: 114 cropsize: 227 - mirror: false + mirror: true } top: "data" top: "label" @@ -27,6 +27,8 @@ layers { type: "constant" value: 0 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "data" top: "conv1" @@ -85,6 +87,8 @@ layers { type: "constant" value: 1 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "pad2" top: "conv2" @@ -142,6 +146,8 @@ layers { type: "constant" value: 0 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "pad3" top: "conv3" @@ -178,6 +184,8 @@ layers { type: "constant" value: 1 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "pad4" top: "conv4" @@ -214,6 +222,8 @@ layers { type: "constant" value: 1 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "pad5" top: "conv5" @@ -250,6 +260,8 @@ layers { type: "constant" value: 1 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "pool5" top: "fc6" @@ -284,6 +296,8 @@ layers { type: "constant" value: 1 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "drop6" top: "fc7" @@ -318,6 +332,8 @@ layers { type: "constant" value: 0 } + blobs_lr: 1. + blobs_lr: 2. } bottom: "drop7" top: "fc8" -- 2.7.4