From 7036f364a45bc2948e835a4b8ae3b19b3f990bcf Mon Sep 17 00:00:00 2001
From: Jihoon Lee
Date: Tue, 7 Jul 2020 21:44:59 +0900
Subject: [PATCH] Add UpdatableParam to manage weights / gradients

**Changes proposed in this PR:**
- Add `UpdatableParam`
- Change `Optimizer::apply_gradient` signature
- Attach `UpdatableParam` to manage weights

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Jihoon Lee
---
 nntrainer/include/bn_layer.h     |  25 +-------
 nntrainer/include/conv2d_layer.h |   5 +-
 nntrainer/include/fc_layer.h     |   2 -
 nntrainer/include/layer.h        |  86 ++++++++++++++++---------
 nntrainer/include/optimizer.h    |  19 ++++--
 nntrainer/src/bn_layer.cpp       |  66 +++++++------------
 nntrainer/src/conv2d_layer.cpp   | 124 +++++++++++++++++++--------------------
 nntrainer/src/fc_layer.cpp       |  51 ++++++++--------
 nntrainer/src/layer.cpp          |  28 ++++++---
 nntrainer/src/optimizer.cpp      |  24 ++++----
 10 files changed, 219 insertions(+), 211 deletions(-)

diff --git a/nntrainer/include/bn_layer.h b/nntrainer/include/bn_layer.h
index 2311616..f6bf411 100644
--- a/nntrainer/include/bn_layer.h
+++ b/nntrainer/include/bn_layer.h
@@ -50,18 +50,6 @@ public:
   ~BatchNormalizationLayer(){};
 
   /**
-   * @brief     Read Weight & Bias Data from file
-   * @param[in] file input stream file
-   */
-  void read(std::ifstream &file);
-
-  /**
-   * @brief     Save Weight & Bias Data to file
-   * @param[in] file output stream file
-   */
-  void save(std::ofstream &file);
-
-  /**
    * @brief     forward propagation with input
   * @param[in] in Input Tensor from upper layer
   * @retval    normalized input tensor using scaling factor
@@ -125,20 +113,9 @@ public:
   std::string getBaseName() { return "BatchNormalization"; };
 
 private:
-  Tensor weight;
-  Tensor bias;
-
-  Tensor mu;   /**< moving mu used for inferencing.
-                    momentum * mu + (1 - momenutm) * mu
-                    of current batch is used */
-  Tensor var;  /**< moving var used for inferencing.
-                    momentum * var + (1 - momenutm) * var
-                    of current batch is used */
-  Tensor cvar; /**< training varaince saved in bn_layer::forwarding and used
-                    in bn_layer::backwarding */
-  Tensor gamma;
-  Tensor beta;
+  Tensor x_normalized;
 
   float epsilon;
 };
diff --git a/nntrainer/include/conv2d_layer.h b/nntrainer/include/conv2d_layer.h
index fa9f880..88ba623 100644
--- a/nntrainer/include/conv2d_layer.h
+++ b/nntrainer/include/conv2d_layer.h
@@ -173,10 +173,7 @@ private:
   unsigned int kernel_size[CONV2D_DIM];
   unsigned int stride[CONV2D_DIM];
   unsigned int padding[CONV2D_DIM];
-  std::vector<Tensor> filters;
-  std::vector<Tensor> delK;
-  std::vector<Tensor> bias;
-  std::vector<Tensor> delBias;
+
   bool normalization;
   bool standardization;
 };
diff --git a/nntrainer/include/fc_layer.h b/nntrainer/include/fc_layer.h
index e639d37..a4e3d91 100644
--- a/nntrainer/include/fc_layer.h
+++ b/nntrainer/include/fc_layer.h
@@ -109,8 +109,6 @@ public:
 
 private:
   unsigned int unit;
-  Tensor weight;
-  Tensor bias;
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/include/layer.h b/nntrainer/include/layer.h
index de02a38..d9bfc93 100644
--- a/nntrainer/include/layer.h
+++ b/nntrainer/include/layer.h
@@ -25,6 +25,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -130,7 +131,8 @@ public:
     weight_decay(),
     weight_ini_type(WEIGHT_XAVIER_UNIFORM),
     flatten(false),
-    trainable(true) {}
+    trainable(true),
+    param_size(0) {}
 
   /**
   * @brief     Destructor of Layer Class
   */
@@ -163,15 +165,17 @@ public:
   /**
   * @brief     read layer Weight & Bias data from file
+   * @note      derived class can call this to get/save updatableParams
   * @param[in] file input file stream
   */
-  virtual void read(std::ifstream &file) = 0;
+  virtual void read(std::ifstream &file);
 
   /**
   * @brief     save layer Weight & Bias data from file
+   * @note      derived class can call this to get/save updatableParams
   * @param[in] file output file stream
   */
-  virtual void save(std::ofstream &file) = 0;
+  virtual void save(std::ofstream &file);
 
   /**
   * @brief     set Property of layer
@@ -219,7 +223,7 @@ public:
   * @brief     Copy Layer
   * @param[in] l Layer to be copied
   */
-  virtual void copy(std::shared_ptr<Layer> l) = 0;
+  virtual void copy(std::shared_ptr<Layer> l);
 
   /**
   * @brief     set Batch Normalization Layer followed
@@ -307,20 +311,10 @@ public:
   void setTrainable(bool train) { trainable = train; }
 
   /**
-   * @brief     get gradients
-   * @retval    shared ptr of vector of all tensors
+   * @brief     get updatable params of all
+   * @retval    vector of all params
   */
-  std::shared_ptr<std::vector<Tensor>> getGradients() {
-    return getObjFromRef(gradients);
-  }
-
-  /**
-   * @brief     get weights
-   * @retval    shared ptr of vector of all tensors
-   */
-  std::shared_ptr<std::vector<Tensor>> getWeights() {
-    return getObjFromRef(weights);
-  }
+  std::shared_ptr<UpdatableParam> getParams() { return params; }
 
   /**
   * @brief     get if the output of this layer must be flatten
@@ -480,25 +474,61 @@ protected:
   bool trainable;
 
   /**
-   * @brief     Gradient for the weights in this layer
-   * @note      The order of gradients should match the order in weights
+   * @brief     reserve memory for @a params and set @a param_size
+   * @exception std::invalid_argument when param_size is already set and
+   *            shouldn't be changed again.
+   */
+  void setParamSize(unsigned int psize) {
+
+    // @note Need opinion about this
+    // if (param_size > 0) {
+    //   throw std::invalid_argument("param size can't be set once it is set");
+    // }
+
+    param_size = psize;
+    params = std::shared_ptr<UpdatableParam>(
+      new UpdatableParam[psize], std::default_delete<UpdatableParam[]>());
+  }
+
+  /**
+   * @brief     get data alias at param position.
+   * @exception std::out_of_range for index out of range
   */
-  std::vector<std::reference_wrapper<Tensor>> gradients;
+  UpdatableParam &paramsAt(const unsigned int position) {
+    if (position >= param_size) {
+      throw std::out_of_range("index out of range");
+    }
+
+    return params.get()[position];
+  }
 
   /**
-   * @brief     weights in this layer
-   * @note      The weights are combined with their corresponding bias
-   *            For example- with W0, W1, B0 and B1, weights would be of format
-   *            {W0, B0, W1, B1}.
+   * @brief     updatable params in this layer. This contains params of layers.
+   * @note      UpdatableParam has weights and gradients paired.
   */
-  std::vector<std::reference_wrapper<Tensor>> weights;
+  std::shared_ptr<UpdatableParam> params;
+
+  unsigned int param_size; /**< length of UpdatableParam * params.
+                                This shouldn't be changed after
+                                initialization; use setParamSize() to avoid
+                                setting parameters twice */
 
 private:
   /**
-   * @brief     Convert vector of reference to vector of objects
+   * @brief     Set containing all the names of layers
+   */
+  static std::set<std::string> layer_names;
+
+  /**
+   * @brief     Count assigned to layer names declared by default
+   */
+  static int def_name_count;
+
+  /**
+   * @brief     Ensure that layer has a name
   */
-  std::shared_ptr<std::vector<Tensor>>
-  getObjFromRef(std::vector<std::reference_wrapper<Tensor>> &elements);
+  void ensureName();
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/include/optimizer.h b/nntrainer/include/optimizer.h
index 07457e0..00f40e8 100644
--- a/nntrainer/include/optimizer.h
+++ b/nntrainer/include/optimizer.h
@@ -24,11 +24,21 @@
 #ifdef __cplusplus
 
 #include
+#include
 #include
 
 namespace nntrainer {
 
 /**
+ * @brief     UpdatableParam that could be updated through optimizer
+ */
+struct UpdatableParam {
+  Tensor weight;    /**< weight to be updated and used */
+  Tensor grad;      /**< gradient for the weight */
+  std::string name; /**< name of the parameter */
+};
+
+/**
  * @brief     Enumeration of Optimizer
  *            0. SGD
  *            1. ADAM
@@ -170,13 +180,12 @@ public:
   /**
   * @brief     apply gradient to weights
-   * @param[in] weights vector of weights
-   * @param[in] gradients vector of corresponding gradients
+   * @param[in] params array of updatable params.
+   * @param[in] param_size size of the array
   * @param[in] iteration nth epoch number
   */
-  void apply_gradients(std::vector<std::reference_wrapper<Tensor>> &weights,
-                       std::vector<std::reference_wrapper<Tensor>> &gradients,
-                       int iteration);
+  void apply_gradients(std::shared_ptr<UpdatableParam> params,
+                       unsigned int param_size, int iteration);
 
   /**
   * @brief     Property Enumeration
diff --git a/nntrainer/src/bn_layer.cpp b/nntrainer/src/bn_layer.cpp
index a076b3d..cf02ca2 100644
--- a/nntrainer/src/bn_layer.cpp
+++ b/nntrainer/src/bn_layer.cpp
@@ -32,28 +32,31 @@
 
 namespace nntrainer {
 
+enum class BNParams { mu, var, gamma, beta };
+
 /// @todo add channel wise bn for convolutional layer.
 int BatchNormalizationLayer::initialize(bool last) {
   int status = ML_ERROR_NONE;
   dim = input_dim;
-  dim = input_dim;
   dim.batch(1);
   output_dim = input_dim;
 
-  this->mu = Tensor(dim);
-  this->var = Tensor(dim);
-  this->gamma = Tensor(dim);
-  this->beta = Tensor(dim);
+  Tensor mu = Tensor(dim);
+  Tensor var = Tensor(dim);
+  Tensor gamma = Tensor(dim);
+  Tensor beta = Tensor(dim);
 
   mu.setZero();
   var.setValue(1);
   gamma.setZero();
   beta.setZero();
 
-  weights.clear();
-  weights.push_back(gamma);
-  weights.push_back(beta);
+  setParamSize(4);
+  paramsAt(0) = {std::move(mu), Tensor(dim), "BN:moving_average"};
+  paramsAt(1) = {std::move(var), Tensor(dim), "BN:moving_variance"};
+  paramsAt(2) = {std::move(gamma), Tensor(dim), "BN:gamma"};
+  paramsAt(3) = {std::move(beta), Tensor(dim), "BN:beta"};
 
   return status;
 }
@@ -90,12 +93,16 @@ int BatchNormalizationLayer::setProperty(std::vector<std::string> values) {
 }
 
 Tensor BatchNormalizationLayer::forwarding(Tensor in, int &status) {
+  Tensor &mu = paramsAt(static_cast<int>(BNParams::mu)).weight;
+  Tensor &var = paramsAt(static_cast<int>(BNParams::var)).weight;
+  Tensor &gamma = paramsAt(static_cast<int>(BNParams::gamma)).weight;
+  Tensor &beta = paramsAt(static_cast<int>(BNParams::beta)).weight;
 
   if (trainable) {
     Tensor deviation;
     this->input = in;
 
-    ///< current mu / var */
+    ///< current mu */
     Tensor cmu;
 
     cmu = in.average(0);
@@ -111,10 +118,10 @@
 
     /// @todo replace momentum paramter
     float momentum = 0.9;
-    this->mu.multiply_i(momentum);
-    this->mu.add_i(cmu, 1 - momentum);
-    this->var.multiply_i(momentum);
-    this->var.add_i(cvar, 1 - momentum);
+    mu.multiply_i(momentum);
+    mu.add_i(cmu, 1 - momentum);
+    var.multiply_i(momentum);
+    var.add_i(cvar, 1 - momentum);
 
     this->x_normalized = deviation.divide(cvar.apply(sqrtFloat));
@@ -130,8 +137,9 @@
 }
 
 Tensor BatchNormalizationLayer::backwarding(Tensor dy, int iteration) {
-  Tensor dbeta;
-  Tensor dgamma;
+  Tensor &gamma = paramsAt(static_cast<int>(BNParams::gamma)).weight;
+  Tensor &dbeta = paramsAt(static_cast<int>(BNParams::beta)).grad;
+  Tensor &dgamma = paramsAt(static_cast<int>(BNParams::gamma)).grad;
   Tensor dx_normalized;
   Tensor dx;
@@ -151,30 +159,16 @@
       .divide_i(cvar.multiply(batch))
       .run();
 
-  gradients.clear();
-  gradients.push_back(dgamma);
-  gradients.push_back(dbeta);
+  std::shared_ptr<UpdatableParam> grad_params(params, params.get() + 2);
 
-  opt.apply_gradients(weights, gradients, iteration);
+  opt.apply_gradients(grad_params, param_size - 2, iteration);
 
   return dx;
 }
 
-void BatchNormalizationLayer::read(std::ifstream &file) {
-  mu.read(file);
-  var.read(file);
-  gamma.read(file);
-  beta.read(file);
-}
-
-void BatchNormalizationLayer::save(std::ofstream &file) {
-  mu.save(file);
-  var.save(file);
-  gamma.save(file);
-  beta.save(file);
-}
-
 void BatchNormalizationLayer::copy(std::shared_ptr<Layer> l) {
+  Layer::copy(l);
+
   std::shared_ptr<BatchNormalizationLayer> from =
     std::static_pointer_cast<BatchNormalizationLayer>(l);
   this->opt = from->opt;
@@ -184,12 +178,6 @@ void BatchNormalizationLayer::copy(std::shared_ptr<Layer> l) {
   this->output_dim = from->output_dim;
   this->input.copy(from->input);
   this->hidden.copy(from->hidden);
-  this->weight.copy(from->weight);
-  this->bias.copy(from->bias);
-  this->mu.copy(from->mu);
-  this->var.copy(from->var);
   this->cvar.copy(from->cvar);
-  this->gamma.copy(from->gamma);
-  this->beta.copy(from->beta);
 }
 } /* namespace nntrainer */
diff --git a/nntrainer/src/conv2d_layer.cpp b/nntrainer/src/conv2d_layer.cpp
index dd2470c..f2932a5 100644
--- a/nntrainer/src/conv2d_layer.cpp
+++ b/nntrainer/src/conv2d_layer.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace nntrainer {
@@ -30,38 +31,43 @@ int Conv2DLayer::initialize(bool last) {
   }
 
   this->last_layer = last;
-  TensorDim Kdim;
-  Kdim.channel(input_dim.channel());
-  Kdim.height(kernel_size[0]);
-  Kdim.width(kernel_size[1]);
-  dim = Kdim;
+  dim = TensorDim(1, input_dim.channel(), kernel_size[0], kernel_size[1]);
 
-  weights.clear();
-  for (unsigned int i = 0; i < filter_size; ++i) {
-    Tensor Knl = initializeWeight(Kdim, weight_ini_type, status);
-    NN_RETURN_STATUS();
+  std::string kernelPrefix = "Conv2d:filter";
+  std::string biasPrefix = "Conv2d:bias";
+  setParamSize(filter_size * 2);
 
-    delK.push_back(
-      Tensor(input_dim.batch(), Kdim.channel(), Kdim.height(), Kdim.width()));
-    delBias.push_back(Tensor(input_dim.batch(), 1, 1, 1));
+  // fixme: #280
+  TensorDim gradDim(input_dim.batch(), dim.channel(), dim.height(),
+                    dim.width());
+  TensorDim gradBiasDim(input_dim.batch(), 1, 1, 1);
 
-    std::vector<Tensor>::iterator iter;
-    for (iter = delK.begin(); iter != delK.end(); ++iter)
-      (*iter).setZero();
-    for (iter = delBias.begin(); iter != delBias.end(); ++iter)
-      (*iter).setZero();
+  for (unsigned int i = 0; i < filter_size; ++i) {
+    Tensor Knl = initializeWeight(dim, weight_ini_type, status);
+    NN_RETURN_STATUS();
 
-    filters.push_back(Knl);
-    weights.push_back(Knl);
+    Tensor bias = Tensor(1, 1);
 
-    Tensor B(1, 1, 1, 1);
-    B.setZero();
     if (!bias_init_zero) {
-      B.apply([&](float x) { return random(); });
+      bias.apply([&](float x) { return random(); });
+    } else {
+      bias.setZero();
     }
-    bias.push_back(B);
-    weights.push_back(B);
 
+    Tensor delK(gradDim);
+    delK.setZero();
+
+    Tensor delBias(gradBiasDim);
+    delBias.setZero();
+
+    /*< @note: order of weight and bias are:
+               w0 w1 w2 ... w3
+     */
+    paramsAt(i) = {std::move(Knl), std::move(delK),
+                   kernelPrefix + std::to_string(i)};
+    paramsAt(i + filter_size) = {std::move(bias), std::move(delBias),
+                                 biasPrefix + std::to_string(i)};
   }
 
+  // this output_dim should be the same with dimension of hidden
   output_dim.batch(input_dim.batch());
   output_dim.channel(filter_size);
@@ -73,17 +79,9 @@ int Conv2DLayer::initialize(bool last) {
   return status;
 }
 
-void Conv2DLayer::read(std::ifstream &file) {
-  std::for_each(filters.begin(), filters.end(),
-                [&](Tensor &i) { i.read(file); });
-  std::for_each(bias.begin(), bias.end(), [&](Tensor &i) { i.read(file); });
-}
+void Conv2DLayer::read(std::ifstream &file) { Layer::read(file); }
 
-void Conv2DLayer::save(std::ofstream &file) {
-  std::for_each(filters.begin(), filters.end(),
-                [&](Tensor i) { i.save(file); });
-  std::for_each(bias.begin(), bias.end(), [&](Tensor i) { i.save(file); });
-}
+void Conv2DLayer::save(std::ofstream &file) { Layer::save(file); }
 
 Tensor Conv2DLayer::forwarding(Tensor in, int &status) {
   if (normalization) {
@@ -100,20 +98,21 @@ Tensor Conv2DLayer::forwarding(Tensor in, int &status) {
                   output_dim.width());
   hidden.setZero();
 
-  std::vector<float> output;
+  std::vector<float> output(output_dim.width() * output_dim.height());
 
-  unsigned int o_size = output_dim.width() * output_dim.height();
-  output.resize(o_size);
   for (unsigned int b = 0; b < in.batch(); ++b) {
     Tensor in_padded = zero_pad(b, input, padding);
+
     for (unsigned int i = 0; i < filter_size; ++i) {
-      status = conv2d(in_padded.getData(), in_padded.getDim(),
-                      filters[i].getData(), filters[i].getDim(), output.data(),
-                      stride, bias[i].getValue(0, 0, 0, 0));
+      Tensor &filter = paramsAt(i).weight;
+      Tensor &bias = paramsAt(i + filter_size).weight;
+      status = conv2d(in_padded.getData(), in_padded.getDim(), filter.getData(),
+                      filter.getDim(), output.data(), stride,
+                      bias.getValue(0, 0, 0, 0));
 
       memcpy(hidden.getAddress(b * hidden.getDim().getFeatureLen() +
                                i * hidden.height() * hidden.width()),
-             output.data(), o_size * sizeof(float));
+             output.data(), output.size() * sizeof(float));
     }
   }
 
@@ -145,18 +144,19 @@ int Conv2DLayer::setOptimizer(Optimizer &opt) {
 
 Tensor Conv2DLayer::backwarding(Tensor derivative, int iteration) {
   // Calculate delK : [batch, channel, height, width ] * filter_size
-  std::vector<float> output;
   unsigned int same_pad[CONV2D_DIM];
   unsigned int o_size = kernel_size[0] * kernel_size[1];
-
-  output.resize(o_size);
+  std::vector<float> output(o_size);
 
   TensorDim in_dim(1, 1, derivative.height(), derivative.width());
 
   for (unsigned int b = 0; b < input_dim.batch(); ++b) {
     Tensor in_padded = zero_pad(b, input, padding);
     TensorDim p_dim(1, 1, in_padded.height(), in_padded.width());
 
+    /// @fixme: #280
     for (unsigned int i = 0; i < filter_size; i++) {
+      Tensor &delK = paramsAt(i).grad;
+      Tensor &delBias = paramsAt(i + filter_size).grad;
      for (unsigned int j = 0; j < in_padded.channel(); ++j) {
        conv2d(
          in_padded.getAddress(j * in_padded.height() * in_padded.width()),
          derivative.getAddress(b * derivative.getDim().getFeatureLen() +
                                i * derivative.height() * derivative.width()),
          in_dim, output.data(), stride, 0.0);
-        memcpy(
-          delK[i].getAddress(b * delK[i].getDim().getFeatureLen() + j * o_size),
-          output.data(), o_size * sizeof(float));
+        memcpy(delK.getAddress(b * delK.getDim().getFeatureLen() + j * o_size),
+               output.data(), o_size * sizeof(float));
      }
 
      // Calculate delBias [ batch , 1, 1, filter_size]
@@ -177,7 +176,7 @@ Tensor Conv2DLayer::backwarding(Tensor derivative, int iteration) {
          sum += derivative.getValue(b, i, j, k);
        }
      }
-      delBias[i].setValue(b, 0, 0, 0, sum);
+      delBias.setValue(b, 0, 0, 0, sum);
    }
  }
@@ -202,10 +201,10 @@ Tensor Conv2DLayer::backwarding(Tensor derivative, int iteration) {
 
    for (unsigned int in_c = 0; in_c < input_dim.channel(); ++in_c) {
      for (unsigned int i = 0; i < derivative.channel(); ++i) {
+        Tensor &filter = paramsAt(i).weight;
        conv2d(in_padded.getAddress(i * in_padded.height() * in_padded.width()),
-               p_dim,
-               filters[i].getAddress(in_c * kernel_size[0] * kernel_size[1]),
+               p_dim, filter.getAddress(in_c * kernel_size[0] * kernel_size[1]),
               kdim, output.data(), stride, 0.0);
        float *ret_vec = ret.getAddress(b * ret.getDim().getFeatureLen() +
                                        in_c * ret.height() * ret.width());
@@ -216,27 +215,26 @@ Tensor Conv2DLayer::backwarding(Tensor derivative, int iteration) {
    }
  }
 
-  gradients.clear();
-
   if (trainable) {
     // Update K / bias
     for (unsigned int i = 0; i < filter_size; ++i) {
-      Tensor djdw = delK[i]
-                      .chain()
-                      .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i),
-                               filters[i], weight_decay.lambda)
-                      .run();
-
-      gradients.push_back(djdw);
-      gradients.push_back(delBias[i]);
+      Tensor &delK = paramsAt(i).grad;
+      Tensor &filter = paramsAt(i).weight;
+
+      delK = delK.chain()
+               .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), filter,
+                        weight_decay.lambda)
+               .run();
    }
-    opt.apply_gradients(weights, gradients, iteration);
+
+    opt.apply_gradients(params, param_size, iteration);
  }
 
  return rotate_180(strip_pad(ret, padding));
 }
 
 void Conv2DLayer::copy(std::shared_ptr<Layer> l) {
+  Layer::copy(l);
   std::shared_ptr<Conv2DLayer> from = std::static_pointer_cast<Conv2DLayer>(l);
   this->filter_size = from->filter_size;
   for (unsigned int i = 0; i < CONV2D_DIM; ++i) {
@@ -245,10 +243,6 @@ void Conv2DLayer::copy(std::shared_ptr<Layer> l) {
     this->padding[i] = from->padding[i];
   }
 
-  for (int i = 0; from->filters.size(); ++i) {
-    this->filters.push_back(from->filters[i]);
-    this->bias.push_back(from->bias[i]);
-  }
   this->input.copy(from->input);
   this->hidden.copy(from->hidden);
   this->dim = from->dim;
diff --git a/nntrainer/src/fc_layer.cpp b/nntrainer/src/fc_layer.cpp
index 7ef6e68..50d486d 100644
--- a/nntrainer/src/fc_layer.cpp
+++ b/nntrainer/src/fc_layer.cpp
@@ -31,17 +31,19 @@
 
 namespace nntrainer {
 
+enum class FCParams { weight, bias };
+
 int FullyConnectedLayer::initialize(bool last) {
   int status = ML_ERROR_NONE;
   this->last_layer = last;
-  bias = Tensor(1, unit);
+  Tensor bias = Tensor(1, unit);
 
   dim = input_dim;
   dim.width(unit);
   dim.height(input_dim.width());
   dim.batch(1);
-  weight = initializeWeight(dim, weight_ini_type, status);
+  Tensor weight = initializeWeight(dim, weight_ini_type, status);
   NN_RETURN_STATUS();
 
   output_dim = input_dim;
@@ -53,9 +55,9 @@ int FullyConnectedLayer::initialize(bool last) {
     bias.setRandUniform(-0.5, 0.5);
   }
 
-  weights.clear();
-  weights.push_back(weight);
-  weights.push_back(bias);
+  setParamSize(2);
+  paramsAt(0) = {std::move(weight), Tensor(dim), "FC:weight"};
+  paramsAt(1) = {std::move(bias), Tensor(1, unit), "FC:bias"};
 
   return status;
 }
@@ -98,6 +100,9 @@ int FullyConnectedLayer::setOptimizer(Optimizer &opt) {
 }
 
 Tensor FullyConnectedLayer::forwarding(Tensor in, int &status) {
+  Tensor &weight = paramsAt(static_cast<int>(FCParams::weight)).weight;
+  Tensor &bias = paramsAt(static_cast<int>(FCParams::bias)).weight;
+
   input = in;
   hidden = input.chain().dot(weight).add_i(bias).run();
   status = ML_ERROR_NONE;
@@ -110,18 +115,18 @@
 void FullyConnectedLayer::read(std::ifstream &file) {
-  weight.read(file);
-  bias.read(file);
+  Layer::read(file);
   opt.read(file);
 }
 
 void FullyConnectedLayer::save(std::ofstream &file) {
-  weight.save(file);
-  bias.save(file);
+  Layer::save(file);
   opt.save(file);
 }
 
 void FullyConnectedLayer::copy(std::shared_ptr<Layer> l) {
+  Layer::copy(l);
+
   std::shared_ptr<FullyConnectedLayer> from =
     std::static_pointer_cast<FullyConnectedLayer>(l);
   this->opt = from->opt;
@@ -132,28 +137,28 @@ void FullyConnectedLayer::copy(std::shared_ptr<Layer> l) {
   this->output_dim = from->output_dim;
   this->input.copy(from->input);
   this->hidden.copy(from->hidden);
-  this->weight.copy(from->weight);
-  this->bias.copy(from->bias);
   this->loss = from->loss;
   this->cost = from->cost;
 }
 
 Tensor FullyConnectedLayer::backwarding(Tensor derivative, int iteration) {
+  unsigned int weight_idx = static_cast<int>(FCParams::weight);
+  unsigned int bias_idx = static_cast<int>(FCParams::bias);
+  Tensor &weight = paramsAt(weight_idx).weight;
+  Tensor &djdw = paramsAt(weight_idx).grad;
+  Tensor &djdb = paramsAt(bias_idx).grad;
+
   Tensor ret = derivative.dot(weight.transpose("0:2:1"));
 
-  Tensor djdb = derivative;
-  Tensor djdw = input.chain()
-                  .transpose("0:2:1")
-                  .dot(derivative)
-                  .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), weight,
-                           weight_decay.lambda)
-                  .run();
+  djdb = derivative;
+  djdw = input.chain()
+           .transpose("0:2:1")
+           .dot(derivative)
+           .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), weight,
+                    weight_decay.lambda)
+           .run();
 
   if (trainable) {
-    gradients.clear();
-    gradients.push_back(djdw);
-    gradients.push_back(djdb);
-
-    opt.apply_gradients(weights, gradients, iteration);
+    opt.apply_gradients(params, param_size, iteration);
   }
 
   return ret;
diff --git a/nntrainer/src/layer.cpp b/nntrainer/src/layer.cpp
index a8d595b..d28eabd 100644
--- a/nntrainer/src/layer.cpp
+++ b/nntrainer/src/layer.cpp
@@ -62,6 +62,25 @@ int Layer::checkValidation() {
   return status;
 }
 
+void Layer::copy(std::shared_ptr<Layer> l) {
+  setParamSize(l->param_size);
+  for (unsigned int i = 0; i < l->param_size; ++i) {
+    paramsAt(i) = l->paramsAt(i);
+  }
+}
+
+void Layer::read(std::ifstream &file) {
+  for (unsigned int i = 0; i < param_size; ++i) {
+    paramsAt(i).weight.read(file);
+  }
+}
+
+void Layer::save(std::ofstream &file) {
+  for (unsigned int i = 0; i < param_size; ++i) {
+    paramsAt(i).weight.save(file);
+  }
+}
+
 Tensor Layer::initializeWeight(TensorDim w_dim, WeightIniType init_type,
                                int &status) {
 
@@ -110,15 +129,6 @@ int Layer::setCost(CostType c) {
   return status;
 }
 
-std::shared_ptr<std::vector<Tensor>>
-Layer::getObjFromRef(std::vector<std::reference_wrapper<Tensor>> &elements) {
-  std::vector<Tensor> ele;
-  for (auto iter = elements.begin(); iter != elements.end(); ++iter)
-    ele.push_back(*iter);
-
-  return std::make_shared<std::vector<Tensor>>(std::move(ele));
-}
-
 int Layer::setProperty(std::vector<std::string> values) {
   int status = ML_ERROR_NONE;
 
diff --git a/nntrainer/src/optimizer.cpp b/nntrainer/src/optimizer.cpp
index 60b14a6..8dcdc80 100644
--- a/nntrainer/src/optimizer.cpp
+++ b/nntrainer/src/optimizer.cpp
@@ -90,11 +90,10 @@ int Optimizer::initialize(TensorDim d, bool set_tensor) {
   return status;
 }
 
-void Optimizer::apply_gradients(
-  std::vector<std::reference_wrapper<Tensor>> &weights,
-  std::vector<std::reference_wrapper<Tensor>> &gradients, int iteration) {
-  if (weights.size() != gradients.size())
-    throw std::runtime_error("Number of gradients and weights should match.");
+void Optimizer::apply_gradients(std::shared_ptr<UpdatableParam> params,
+                                unsigned int param_size, int iteration) {
+
+  UpdatableParam *param_data = params.get();
 
   float ll = popt.learning_rate;
   if (popt.decay_steps != -1) {
@@ -110,14 +109,15 @@ void Optimizer::apply_gradients(
   }
 
   int idx = 0;
-  std::vector<std::reference_wrapper<Tensor>>::iterator w_iter, g_iter;
-  for (w_iter = weights.begin(), g_iter = gradients.begin();
-       w_iter != weights.end(); ++w_iter, ++g_iter) {
-    Tensor &x = *w_iter;
-    Tensor x_grad = *g_iter;
-
+  for (unsigned int i = 0; i < param_size; ++i) {
+    UpdatableParam &param = param_data[i];
+
+    Tensor &x = param.weight;
+    /// @fixme: #280 and use const Tensor &x_grad once fixed.
+    /// @note: the current implementation does not update grad since updating
+    ///        grad changes its dimension
+    Tensor x_grad = param.grad;
     x_grad = x_grad.average();
-
     switch (type) {
     case OptType::sgd:
       x.add_i(x_grad, -ll);
-- 
2.7.4
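Illustrative sketch (not part of the applied diff): the standalone program below shows, with simplified stand-in types, the ownership model this patch introduces — each layer owns a single UpdatableParam array in which every weight is paired with its gradient and a name, and the optimizer walks that array instead of two parallel vectors. The Tensor struct, the add_i helper, and the fixed-learning-rate SGD loop are placeholders assumed for the sketch, not nntrainer code; only the UpdatableParam layout and the apply_gradients(params, param_size, ...) calling convention mirror the patch.

#include <cstddef>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Minimal stand-in for nntrainer::Tensor; only what the sketch needs.
struct Tensor {
  std::vector<float> data;
  // x.add_i(g, alpha): x += alpha * g, mirroring the in-place add used by SGD.
  void add_i(const Tensor &g, float alpha) {
    for (std::size_t i = 0; i < data.size(); ++i)
      data[i] += alpha * g.data[i];
  }
};

// Same layout as the struct this patch adds to optimizer.h:
// weight and gradient travel together, plus a human-readable name.
struct UpdatableParam {
  Tensor weight;
  Tensor grad;
  std::string name;
};

// Shape of the new Optimizer::apply_gradients(params, param_size, iteration)
// call, reduced here to plain SGD with a fixed learning rate.
void apply_gradients(std::shared_ptr<UpdatableParam> params,
                     unsigned int param_size, float learning_rate) {
  UpdatableParam *param_data = params.get();
  for (unsigned int i = 0; i < param_size; ++i) {
    UpdatableParam &param = param_data[i];
    param.weight.add_i(param.grad, -learning_rate);
  }
}

int main() {
  // What a layer's setParamSize(2) followed by paramsAt(i) = {...} boils down to.
  unsigned int param_size = 2;
  std::shared_ptr<UpdatableParam> params(
    new UpdatableParam[param_size], std::default_delete<UpdatableParam[]>());
  params.get()[0] = {Tensor{{1.0f, 2.0f}}, Tensor{{0.1f, 0.1f}}, "FC:weight"};
  params.get()[1] = {Tensor{{0.5f}}, Tensor{{0.2f}}, "FC:bias"};

  apply_gradients(params, param_size, 0.01f);

  std::cout << params.get()[0].name << "[0] = "
            << params.get()[0].weight.data[0] << std::endl; // prints 0.999
  return 0;
}

The shared_ptr-with-array-deleter pattern matches setParamSize() in layer.h, and an aliased shared_ptr offset (as bn_layer.cpp does with params.get() + 2) lets a layer expose only a suffix of its parameter array to the optimizer.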