From 43fa260ecf3e0f67bfefb8bc89f2c8a2635fa358 Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Tue, 11 Feb 2020 15:05:39 +0900
Subject: [PATCH] Decayed Learning Rate

Implement Decayed Learning Rate for better convergence

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
 include/layers.h                        |  2 ++
 include/neuralnet.h                     | 10 ++++++++
 src/layers.cpp                          | 22 ++++++++++++-----
 src/neuralnet.cpp                       |  5 ++++
 test/decayed_learning_rate/Training.ini | 42 +++++++++++++++++++++++++++++++++
 5 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 test/decayed_learning_rate/Training.ini

diff --git a/include/layers.h b/include/layers.h
index 42d8fec..ffbae7e 100644
--- a/include/layers.h
+++ b/include/layers.h
@@ -76,6 +76,8 @@ typedef struct {
   double beta2;
   double epsilon;
   acti_type activation;
+  float decay_rate;
+  float decay_steps;
 } Optimizer;
 
 /**
diff --git a/include/neuralnet.h b/include/neuralnet.h
index 9db5e89..35808fa 100644
--- a/include/neuralnet.h
+++ b/include/neuralnet.h
@@ -164,6 +164,16 @@ class NeuralNetwork {
   float learning_rate;
 
   /**
+   * @brief decay_rate for decayed learning rate
+   */
+  float decay_rate;
+
+  /**
+   * @brief decay_steps for decayed learning rate
+   */
+  float decay_steps;
+
+  /**
    * @brief Maximum Epoch
    */
   unsigned int epoch;
diff --git a/src/layers.cpp b/src/layers.cpp
index 2852a0a..f82b20b 100644
--- a/src/layers.cpp
+++ b/src/layers.cpp
@@ -182,23 +182,28 @@ Tensor FullyConnectedLayer::backwarding(Tensor derivative, int iteration) {
   Tensor dJdW = Input.transpose().dot(dJdB);
   Tensor ret = dJdB.dot(Weight.transpose());
 
+  float ll = opt.learning_rate;
+  if (opt.decay_steps != -1) {
+    ll = opt.learning_rate * pow(opt.decay_rate, (iteration / opt.decay_steps));
+  }
+
   switch (opt.type) {
     case OPT_SGD:
-      Weight = Weight.subtract(dJdW.average().multiply(opt.learning_rate));
+      Weight = Weight.subtract(dJdW.average().multiply(ll));
       break;
     case OPT_ADAM:
       M = M.multiply(opt.beta1).add(dJdW.average().multiply(1 - opt.beta1));
       V = V.multiply(opt.beta2).add((dJdW.average().multiply(dJdW.average())).multiply(1 - opt.beta2));
       M.divide(1 - pow(opt.beta1, iteration + 1));
       V.divide(1 - pow(opt.beta2, iteration + 1));
-      Weight = Weight.subtract((M.divide(V.applyFunction(sqrt_float).add(opt.epsilon))).multiply(opt.learning_rate));
+      Weight = Weight.subtract((M.divide(V.applyFunction(sqrt_float).add(opt.epsilon))).multiply(ll));
       break;
     default:
       break;
   }
 
   if (!this->init_zero) {
-    Bias = Bias.subtract(dJdB.average().multiply(opt.learning_rate));
+    Bias = Bias.subtract(dJdB.average().multiply(ll));
   }
 
   return ret;
@@ -284,6 +289,11 @@ Tensor OutputLayer::backwarding(Tensor label, int iteration) {
   Tensor ret;
   Tensor dJdB;
 
+  float ll = opt.learning_rate;
+  if (opt.decay_steps != -1) {
+    ll = opt.learning_rate * pow(opt.decay_rate, (iteration / opt.decay_steps));
+  }
+
   if (cost == COST_ENTROPY) {
     dJdB = Y.subtract(Y2);
     Tensor temp = ((Y2.multiply(-1.0).transpose().dot(Y.add(opt.epsilon).applyFunction(log_float)))
@@ -308,21 +318,21 @@ Tensor OutputLayer::backwarding(Tensor label, int iteration) {
 
   switch (opt.type) {
     case Layers::OPT_SGD:
-      Weight = Weight.subtract(dJdW.average().multiply(opt.learning_rate));
+      Weight = Weight.subtract(dJdW.average().multiply(ll));
       break;
     case Layers::OPT_ADAM:
       M = M.multiply(opt.beta1).add(dJdW.average().multiply(1 - opt.beta1));
       V = V.multiply(opt.beta2).add((dJdW.average().multiply(dJdW.average())).multiply(1 - opt.beta2));
       M.divide(1 - pow(opt.beta1, iteration + 1));
       V.divide(1 - pow(opt.beta2, iteration + 1));
-      Weight = Weight.subtract((M.divide(V.applyFunction(sqrt_float).add(opt.epsilon))).multiply(opt.learning_rate));
+      Weight = Weight.subtract((M.divide(V.applyFunction(sqrt_float).add(opt.epsilon))).multiply(ll));
       break;
     default:
       break;
   }
 
   if (!this->init_zero) {
-    Bias = Bias.subtract(dJdB.average().multiply(opt.learning_rate));
+    Bias = Bias.subtract(dJdB.average().multiply(ll));
   }
 
   return ret;
diff --git a/src/neuralnet.cpp b/src/neuralnet.cpp
index 62cf98f..c580f7d 100644
--- a/src/neuralnet.cpp
+++ b/src/neuralnet.cpp
@@ -197,7 +197,12 @@ void NeuralNetwork::init() {
   nettype = (Network::net_type)parseType(iniparser_getstring(ini, "Network:Type", NULL), TOKEN_NET);
   std::vector<std::string> layers_name = parseLayerName(iniparser_getstring(ini, "Network:Layers", NULL));
   learning_rate = iniparser_getdouble(ini, "Network:Learning_rate", 0.0);
+  decay_rate = iniparser_getdouble(ini, "Network:Decay_rate", 0.0);
+  decay_steps = iniparser_getint(ini, "Network:Decay_steps", -1);
+
   opt.learning_rate = learning_rate;
+  opt.decay_steps = decay_steps;
+  opt.decay_rate = decay_rate;
   epoch = iniparser_getint(ini, "Network:Epoch", 100);
   opt.type = (Layers::opt_type)parseType(iniparser_getstring(ini, "Network:Optimizer", NULL), TOKEN_OPT);
   opt.activation = (Layers::acti_type)parseType(iniparser_getstring(ini, "Network:Activation", NULL), TOKEN_ACTI);
diff --git a/test/decayed_learning_rate/Training.ini b/test/decayed_learning_rate/Training.ini
new file mode 100644
index 0000000..70ed705
--- /dev/null
+++ b/test/decayed_learning_rate/Training.ini
@@ -0,0 +1,42 @@
+# Network Section : Network
+[Network]
+Type = NeuralNetwork      # Network Type : Regression, KNN, NeuralNetwork
+Layers = inputlayer \
+         fc1layer \
+         outputlayer      # Layers of NeuralNetwork
+Learning_rate = 0.7       # Learning Rate
+Decay_rate = 0.96         # decay rate for the decayed learning rate
+Decay_steps = 1000        # decay steps for the exponentially decayed learning rate
+Epoch = 300               # Epoch
+Optimizer = adam          # Optimizer : sgd (stochastic gradient descent),
+                          #             adam (Adaptive Moment Estimation)
+Activation = sigmoid      # activation : sigmoid, tanh
+Cost = msr                # Cost(loss) function : msr (mean square root error)
+                          #                       categorical ( for logistic regression )
+Model = "model.bin"       # model path to save / read
+minibatch = 1             # mini batch size
+beta1 = 0.9               # beta 1 for adam
+beta2 = 0.9999            # beta 2 for adam
+epsilon = 1e-8            # epsilon for adam
+
+# Layer Section : Name
+[inputlayer]
+Type = InputLayer
+Id = 0                    # Layer Id
+Height = 1
+Width = 128               # Input Layer Dimension
+Bias_zero = true          # Zero Bias
+
+[fc1layer]
+Type = FullyConnectedLayer
+Id = 1
+Height = 128              # Input Dimension ( = Weight Height )
+Width = 20                # Hidden Layer Dimension ( = Weight Width )
+Bias_zero = true
+
+[outputlayer]
+Type = OutputLayer
+Id = 3
+Height = 20               # Hidden Layer Dimension ( = Weight Height )
+Width = 3                 # Output Layer Dimension ( = Weight Width )
+Bias_zero = true
-- 
2.7.4
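For reference, the schedule wired in above is plain exponential decay: the effective rate used in `backwarding()` is `Learning_rate * Decay_rate^(iteration / Decay_steps)`, and decay is skipped entirely when `Decay_steps` keeps its default of -1. The standalone sketch below mirrors that computation; the function name and the `main()` driver are illustrative only and are not part of the patch.

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the decay schedule applied in FullyConnectedLayer::backwarding()
// and OutputLayer::backwarding(). Because decay_steps is a float, the exponent
// iteration / decay_steps is real-valued, so the rate shrinks smoothly every
// iteration rather than in discrete staircase steps.
float decayed_learning_rate(float learning_rate, float decay_rate,
                            float decay_steps, int iteration) {
  if (decay_steps == -1)
    return learning_rate;  // decay disabled (default when Decay_steps is absent)
  return learning_rate * std::pow(decay_rate, iteration / decay_steps);
}

int main() {
  // Values taken from test/decayed_learning_rate/Training.ini:
  // Learning_rate = 0.7, Decay_rate = 0.96, Decay_steps = 1000
  const int iterations[] = {0, 1000, 10000};
  for (int iter : iterations)
    std::printf("iteration %5d -> learning rate %.4f\n", iter,
                decayed_learning_rate(0.7f, 0.96f, 1000.0f, iter));
  return 0;
}
```

With the Training.ini settings the rate starts at 0.7, falls to about 0.672 after 1000 iterations, and to roughly 0.47 after 10000, which is the gradual shrinkage the commit message credits with better convergence.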