From 412f8a0832d10c194ba58d8cc6d44a150ff25fcf Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Fri, 29 Jan 2021 20:03:51 +0900
Subject: [PATCH] [weight/layer] Move weight regularization out of layers

Move weight regularization out of the layers into the weights and remove
the corresponding code from all the layers.
The loss and gradients from weight regularization are now computed by the
weight itself.

Signed-off-by: Parichay Kapoor
---
A short usage sketch of the new Weight regularization API is appended
after the patch.

 nntrainer/layers/bn_layer.cpp     | 24 ++++++++++------
 nntrainer/layers/conv2d_layer.cpp | 29 ++++++++-----------
 nntrainer/layers/fc_layer.cpp     | 20 ++++++-------
 nntrainer/layers/layer.cpp        |  4 +--
 nntrainer/layers/layer_internal.h | 32 ++++++---------------
 nntrainer/models/neuralnet.cpp    |  5 ++--
 nntrainer/tensor/weight.cpp       |  9 ++++--
 nntrainer/tensor/weight.h         | 59 +++++++++++++++++++++++++++++++++------
 nntrainer/utils/parse_util.cpp    |  9 +++---
 9 files changed, 112 insertions(+), 79 deletions(-)

diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp
index 4142af4..8212118 100644
--- a/nntrainer/layers/bn_layer.cpp
+++ b/nntrainer/layers/bn_layer.cpp
@@ -62,18 +62,26 @@ int BatchNormalizationLayer::initialize(Manager &manager) {
   weights.clear();
   if (weights.empty()) {
     weights.reserve(4);
-    weights.emplace_back(dim, initializers[BNParams::mu], false,
+    weights.emplace_back(dim, initializers[BNParams::mu],
+                         WeightRegularizer::NONE, 1.0f, false,
                          "BN::moving_mean");
-    weights.emplace_back(dim, initializers[BNParams::var], false,
+    weights.emplace_back(dim, initializers[BNParams::var],
+                         WeightRegularizer::NONE, 1.0f, false,
                          "BN::moving_variance");
-    weights.emplace_back(dim, initializers[BNParams::gamma], true, "BN::gamma");
-    weights.emplace_back(dim, initializers[BNParams::beta], true, "BN::beta");
+    weights.emplace_back(dim, initializers[BNParams::gamma],
+                         WeightRegularizer::NONE, 1.0f, true, "BN::gamma");
+    weights.emplace_back(dim, initializers[BNParams::beta],
+                         WeightRegularizer::NONE, 1.0f, true, "BN::beta");
     manager.trackWeights(weights);
   } else {
-    weights[BNParams::mu].reset(dim, initializers[BNParams::mu], false);
-    weights[BNParams::var].reset(dim, initializers[BNParams::var], false);
-    weights[BNParams::gamma].reset(dim, initializers[BNParams::gamma], true);
-    weights[BNParams::beta].reset(dim, initializers[BNParams::beta], true);
+    weights[BNParams::mu].reset(dim, initializers[BNParams::mu],
+                                WeightRegularizer::NONE, 1.0f, false);
+    weights[BNParams::var].reset(dim, initializers[BNParams::var],
+                                 WeightRegularizer::NONE, 1.0f, false);
+    weights[BNParams::gamma].reset(dim, initializers[BNParams::gamma],
+                                   WeightRegularizer::NONE, 1.0f, true);
+    weights[BNParams::beta].reset(dim, initializers[BNParams::beta],
+                                  WeightRegularizer::NONE, 1.0f, true);
   }
 
   return status;
diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp
index d91dc9e..4dd39e0 100644
--- a/nntrainer/layers/conv2d_layer.cpp
+++ b/nntrainer/layers/conv2d_layer.cpp
@@ -74,12 +74,17 @@ int Conv2DLayer::initialize(Manager &manager) {
 
   if (weights.empty()) {
     weights.reserve(2);
-    weights.emplace_back(dim, weight_initializer, true, "Conv2d:filter");
-    weights.emplace_back(bias_dim, bias_initializer, true, "Conv2d:bias");
+    weights.emplace_back(dim, weight_initializer, weight_regularizer,
+                         weight_regularizer_constant, true, "Conv2d:filter");
+    weights.emplace_back(bias_dim, bias_initializer, WeightRegularizer::NONE,
+                         1.0f, true, "Conv2d:bias");
     manager.trackWeights(weights);
   } else {
-    weights[ConvParams::weight].reset(dim, weight_initializer, true);
-    weights[ConvParams::bias].reset(bias_dim, bias_initializer, true);
+    weights[ConvParams::weight].reset(dim, weight_initializer,
+                                      weight_regularizer,
+                                      weight_regularizer_constant, true);
+    weights[ConvParams::bias].reset(bias_dim, bias_initializer,
+                                    WeightRegularizer::NONE, 1.0f, true);
   }
 
   // this output_dim should be the same with dimension of hidden
@@ -181,10 +186,7 @@ void Conv2DLayer::forwarding(bool training) {
   }
   END_PROFILE(add_bias_key);
 
-  loss = 0.0f;
-  if (weight_regularizer == WeightRegularizerType::l2norm) {
-    loss += weight_regularizer_constant * 0.5f * (filter_kernel.l2norm());
-  }
+  loss = weightAt(ConvParams::weight).getRegularizationLoss();
 }
 
 void Conv2DLayer::calcDerivative() {
@@ -293,7 +295,6 @@ void Conv2DLayer::calcDerivative() {
 
 void Conv2DLayer::calcGradient() {
   TensorDim &in_dim = input_dim[0];
 
-  Tensor &filter_kernel = weightAt(ConvParams::weight).getVariableRef();
   Tensor &derivative = net_hidden[0]->getGradientRef();
   Tensor &input_ = net_input[0]->getVariableRef();
@@ -336,8 +337,6 @@ void Conv2DLayer::calcGradient() {
    *   x [input_dim.channel * kernel_size[0] * kernel_size[1] (width) ]
    */
 
-  int status = ML_ERROR_NONE;
-
   TensorDim kdim{
     {derivative.channel(), derivative.height(), derivative.width()}};
 
@@ -359,12 +358,8 @@ void Conv2DLayer::calcGradient() {
   delK.reshape(out_dim);
   delBias = derivative.sum({0, 2, 3});
 
-  // Update K / bias
-  if (isWeightRegularizerL2Norm()) {
-    status = delK.add_i(filter_kernel, weight_regularizer_constant);
-    if (status != ML_ERROR_NONE)
-      throw std::runtime_error("Weight regularization failed");
-  }
+  /// calculate regularization based gradient for weight only
+  weightAt(ConvParams::weight).calcRegularizationGradient();
 }
 
 void Conv2DLayer::copy(std::shared_ptr<Layer> l) {
diff --git a/nntrainer/layers/fc_layer.cpp b/nntrainer/layers/fc_layer.cpp
index 7e10047..fde47d3 100644
--- a/nntrainer/layers/fc_layer.cpp
+++ b/nntrainer/layers/fc_layer.cpp
@@ -54,12 +54,16 @@ int FullyConnectedLayer::initialize(Manager &manager) {
 
   if (weights.empty()) {
     weights.reserve(2);
-    weights.emplace_back(dim, weight_initializer, true, "FC:weight");
-    weights.emplace_back(bias_dim, bias_initializer, true, "FC:bias");
+    weights.emplace_back(dim, weight_initializer, weight_regularizer,
+                         weight_regularizer_constant, true, "FC:weight");
+    weights.emplace_back(bias_dim, bias_initializer, WeightRegularizer::NONE,
+                         1.0f, true, "FC:bias");
     manager.trackWeights(weights);
   } else {
-    weights[FCParams::weight].reset(dim, weight_initializer, true);
-    weights[FCParams::bias].reset(bias_dim, bias_initializer, true);
+    weights[FCParams::weight].reset(dim, weight_initializer, weight_regularizer,
+                                    weight_regularizer_constant, true);
+    weights[FCParams::bias].reset(bias_dim, bias_initializer,
+                                  WeightRegularizer::NONE, 1.0f, true);
   }
 
   return status;
@@ -92,9 +96,7 @@ void FullyConnectedLayer::forwarding(bool training) {
   hidden_ = input_.dot(weight, hidden_);
   hidden_.add_i(bias);
 
-  if (weight_regularizer == WeightRegularizerType::l2norm) {
-    loss = weight_regularizer_constant * 0.5f * (weight.l2norm());
-  }
+  loss = weightAt(static_cast<unsigned int>(FCParams::weight)).getRegularizationLoss();
 }
 
 void FullyConnectedLayer::copy(std::shared_ptr<Layer> l) {
@@ -117,7 +119,6 @@ void FullyConnectedLayer::calcDerivative() {
 
 void FullyConnectedLayer::calcGradient() {
   unsigned int weight_idx = static_cast<unsigned int>(FCParams::weight);
   unsigned int bias_idx = static_cast<unsigned int>(FCParams::bias);
-  Tensor &weight = weightAt(weight_idx).getVariableRef();
   Tensor &djdw = weightAt(weight_idx).getGradientRef();
   Tensor &djdb = weightAt(bias_idx).getGradientRef();
@@ -126,8 +127,7 @@ void FullyConnectedLayer::calcGradient() {
   djdb = derivative_.sum(0);
   djdw = net_input[0]->getVariableRef().dot(derivative_, djdw, true, false);
 
-  if (isWeightRegularizerL2Norm())
-    djdw.add_i(weight, weight_regularizer_constant);
+  weightAt(weight_idx).calcRegularizationGradient();
 }
 
 void FullyConnectedLayer::scaleSize(float scalesize) noexcept {
diff --git a/nntrainer/layers/layer.cpp b/nntrainer/layers/layer.cpp
index 5f725c0..a002957 100644
--- a/nntrainer/layers/layer.cpp
+++ b/nntrainer/layers/layer.cpp
@@ -232,8 +232,8 @@ void Layer::setProperty(const PropertyType type, const std::string &value) {
   case PropertyType::weight_regularizer:
     if (!value.empty()) {
       weight_regularizer =
-        (WeightRegularizerType)parseType(value, TOKEN_WEIGHT_REGULARIZER);
-      if (weight_regularizer == WeightRegularizerType::unknown) {
+        (WeightRegularizer)parseType(value, TOKEN_WEIGHT_REGULARIZER);
+      if (weight_regularizer == WeightRegularizer::UNKNOWN) {
         throw std::invalid_argument("[Layer] Unknown Weight decay");
       }
     }
diff --git a/nntrainer/layers/layer_internal.h b/nntrainer/layers/layer_internal.h
index 7ecc294..b853cc3 100644
--- a/nntrainer/layers/layer_internal.h
+++ b/nntrainer/layers/layer_internal.h
@@ -63,14 +63,13 @@ public:
   /**
    * @brief     Constructor of Layer Class
    */
-  Layer(
-    ActivationType activation_type_ = ActivationType::ACT_NONE,
-    WeightRegularizerType weight_regularizer_ = WeightRegularizerType::unknown,
-    const float weight_regularizer_constant_ = 1.0f,
-    WeightInitializer weight_initializer_ =
-      WeightInitializer::WEIGHT_XAVIER_UNIFORM,
-    WeightInitializer bias_initializer_ = WeightInitializer::WEIGHT_ZEROS,
-    bool trainable_ = true, bool flatten_ = false) :
+  Layer(ActivationType activation_type_ = ActivationType::ACT_NONE,
+        WeightRegularizer weight_regularizer_ = WeightRegularizer::NONE,
+        const float weight_regularizer_constant_ = 1.0f,
+        WeightInitializer weight_initializer_ =
+          WeightInitializer::WEIGHT_XAVIER_UNIFORM,
+        WeightInitializer bias_initializer_ = WeightInitializer::WEIGHT_ZEROS,
+        bool trainable_ = true, bool flatten_ = false) :
     name(std::string()),
     loss(0.0f),
     activation_type(activation_type_),
@@ -389,13 +388,6 @@ protected:
   std::string name;
 
   /**
-   * @brief     check if current layer's weight decay type is l2norm
-   * @return    bool is weightdecay type is L2 Norm
-   */
-  bool isWeightRegularizerL2Norm() {
-    return weight_regularizer == WeightRegularizerType::l2norm;
-  }
-  /**
    * @brief     Input Tensor
    */
   Tensor input;
@@ -428,7 +420,7 @@ protected:
 
   ActivationType activation_type;
 
-  WeightRegularizerType weight_regularizer;
+  WeightRegularizer weight_regularizer;
 
   float weight_regularizer_constant;
 
@@ -520,14 +512,6 @@ private:
   virtual void printMetric(std::ostream &out);
 
   /**
-   * @brief     set weight decay parameters
-   * @param[in] w struct for weight decay
-   */
-  void setWeightRegularizer(WeightRegularizerType type) {
-    weight_regularizer = type;
-  }
-
-  /**
    * @brief     set Weight Initialization Type
    * @param[in] wini WeightInitializer
    */
diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp
index 47a5af5..44274f7 100644
--- a/nntrainer/models/neuralnet.cpp
+++ b/nntrainer/models/neuralnet.cpp
@@ -643,9 +643,10 @@ int NeuralNetwork::train_run() {
           std::rethrow_exception(std::current_exception());
         }
         std::cout << "#" << epoch_idx << "/" << epochs;
+        float loss = getLoss();
         data_buffer->displayProgress(count++, nntrainer::BufferType::BUF_TRAIN,
-                                     getLoss());
-        training.loss += getLoss();
+                                     loss);
+        training.loss += loss;
       } else {
         data_buffer->clear(nntrainer::BufferType::BUF_TRAIN);
         break;
diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp
index a87d494..65f40b8 100644
--- a/nntrainer/tensor/weight.cpp
+++ b/nntrainer/tensor/weight.cpp
@@ -16,12 +16,17 @@
 
 namespace nntrainer {
 
-Weight::Weight(const TensorDim &dim, const WeightInitializer init, bool train,
+Weight::Weight(const TensorDim &dim, const WeightInitializer init,
+               const WeightRegularizer reg, const float reg_const, bool train,
                bool alloc_now_, std::string name) :
   Var_Grad(dim, train, alloc_now_, name),
-  initializer(init) {
+  initializer(init),
+  regularizer(reg),
+  regularizer_constant(reg_const) {
   if (initializer == WeightInitializer::WEIGHT_UNKNOWN)
     throw std::invalid_argument("Weight initializer unknown");
+  if (regularizer == WeightRegularizer::UNKNOWN)
+    throw std::invalid_argument("Weight regularizer unknown");
 }
 
 void Weight::initializeVariable(const Tensor &preallocated) {
diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h
index 9d0703e..70fe987 100644
--- a/nntrainer/tensor/weight.h
+++ b/nntrainer/tensor/weight.h
@@ -20,11 +20,12 @@
 namespace nntrainer {
 
 /**
- * @brief     Enumeration of Weight Decay type
+ * @brief     Enumeration of Weight Regularizer
  */
-enum class WeightRegularizerType {
-  l2norm, /** L2 norm regularizer */
-  unknown /** Unknown */
+enum class WeightRegularizer {
+  L2NORM, /**< L2 norm regularization */
+  NONE,   /**< no regularization */
+  UNKNOWN /**< Unknown */
 };
 
 /**
@@ -63,20 +64,26 @@ public:
   /**
   * @brief Weight default constructor
   */
-  Weight() : Var_Grad(), initializer(WeightInitializer::WEIGHT_UNKNOWN) {}
+  Weight() :
+    Var_Grad(),
+    initializer(WeightInitializer::WEIGHT_UNKNOWN),
+    regularizer(WeightRegularizer::UNKNOWN),
+    regularizer_constant(1.0f) {}
 
   /**
   * @brief Construct a new Weight object
   *
   * @param dim Variable and gradient tensor dimension
-  * @param init Initializer for the tensor
+  * @param init Initializer for the weight
+  * @param reg Regularizer for the weight
   * @param train If the variable is trainable
   * @param name Name for this weight
   */
   Weight(
     const TensorDim &dim,
     const WeightInitializer init = WeightInitializer::WEIGHT_XAVIER_UNIFORM,
-    bool train = true, bool alloc_now = true, std::string name = "");
+    const WeightRegularizer reg = WeightRegularizer::NONE,
+    const float reg_const = 1.0f, bool train = true, bool alloc_now = true, std::string name = "");
 
   /**
   * @copydoc var_grad::initializeVariable(const Tensor &)
@@ -99,6 +106,7 @@ public:
     using std::swap;
     swap(static_cast<Var_Grad &>(lhs), static_cast<Var_Grad &>(rhs));
     swap(lhs.initializer, rhs.initializer);
+    swap(lhs.regularizer, rhs.regularizer);
   }
 
   /**
@@ -150,13 +158,18 @@ public:
   * @brief Reset the weight
   *
   * @param dim Variable and gradient tensor dimension
-  * @param init Initializer for the tensor
+  * @param init Initializer for the weight
+  * @param reg Regularizer for the weight
   * @param train If the variable is trainable
   *
   * @note New dimension must maintain the shape of the variable
   */
-  void reset(const TensorDim &dim, const WeightInitializer init, bool train) {
+  void reset(const TensorDim &dim, const WeightInitializer init,
+             const WeightRegularizer reg, const float reg_const, bool train) {
    initializer = init;
+    regularizer = reg;
+    regularizer_constant = reg_const;
+
    Var_Grad::reset(dim, train);
   }
 
@@ -201,8 +214,36 @@ public:
     allocateOptimizerVariables();
   }
 
+  /**
+   * @brief check if weight regularizer type is l2norm
+   * @return true if weight regularizer type is L2 norm
+   */
+  bool isWeightRegularizerL2Norm() {
+    return regularizer == WeightRegularizer::L2NORM;
+  }
+
+  /**
+   * @brief Get loss from the regularization of the weight
+   */
+  float getRegularizationLoss() {
+    if (isWeightRegularizerL2Norm())
+      return regularizer_constant * 0.5f * var->l2norm();
+
+    return 0;
+  }
+
+  /**
+   * @brief Calculate gradient from the regularization of the weight
+   */
+  void calcRegularizationGradient() {
+    if (isWeightRegularizerL2Norm())
+      grad->add_i(*var.get(), regularizer_constant);
+  }
+
 private:
   WeightInitializer initializer; /**< initializer for this variable */
+  WeightRegularizer regularizer; /**< regularizer for this variable */
+  float regularizer_constant;    /**< constant factor for regularization */
 
   std::vector<Tensor> opt_vars;        /**< optimizer variables */
   std::vector<TensorDim> opt_vars_dim; /**< optimizer variables dimensions */
diff --git a/nntrainer/utils/parse_util.cpp b/nntrainer/utils/parse_util.cpp
index 2a472a7..4cd5c61 100644
--- a/nntrainer/utils/parse_util.cpp
+++ b/nntrainer/utils/parse_util.cpp
@@ -111,12 +111,11 @@ unsigned int parseType(std::string ll, InputType t) {
     "xavier_normal", "xavier_uniform", "he_normal", "he_uniform"};
 
   /**
-   * @brief     Weight Decay String from configure file
+   * @brief     Weight Regularization String from configure file
    *            "L2Norm"  : squared norm regularization
-   *            "Regression" : Regression
+   *            "None"  : none
    */
-  std::array<std::string, 2> weight_regularizer_string = {"l2norm",
-                                                          "regression"};
+  std::array<std::string, 2> weight_regularizer_string = {"l2norm", "none"};
 
   /**
   * @brief     Weight Decay String from configure file
@@ -183,7 +182,7 @@ unsigned int parseType(std::string ll, InputType t) {
         return (i);
       }
     }
-    ret = (unsigned int)WeightRegularizerType::unknown;
+    ret = (unsigned int)WeightRegularizer::UNKNOWN;
     break;
   case TOKEN_PADDING:
     for (i = 0; i < padding_string.size(); i++) {
-- 
2.7.4
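
Usage sketch of the new Weight regularization API (not part of the patch):
the snippet below mirrors what fc_layer.cpp and conv2d_layer.cpp do after
this change. Only the Weight constructor, getRegularizationLoss() and
calcRegularizationGradient() come from the patch itself; the weight.h
include path, the TensorDim arguments, the 0.001f decay constant and the
function/weight names are illustrative assumptions.

// Minimal sketch, assuming nntrainer's flat include layout and that a
// Weight built with alloc_now = true has its variable and gradient tensors
// allocated. The values below are examples only.
#include <weight.h>

using namespace nntrainer;

void regularization_sketch() {
  TensorDim dim(1, 1, 4, 4); // batch, channel, height, width (example)

  // The regularizer type and its constant now travel with the weight.
  Weight w(dim, WeightInitializer::WEIGHT_XAVIER_UNIFORM,
           WeightRegularizer::L2NORM, 0.001f, true, true, "example:weight");

  // forwarding(): instead of computing regularizer_constant * 0.5f *
  // l2norm(weight) itself, a layer asks the weight for its loss contribution.
  float reg_loss = w.getRegularizationLoss();

  // calcGradient(): after the data gradient is written into the weight's
  // gradient tensor, the L2 term (constant * W) is added by the weight.
  w.calcRegularizationGradient();

  (void)reg_loss; // silence unused-variable warning in this sketch
}

Keeping both pieces behind the Weight interface means the L2 regularization
math lives in exactly one place instead of being repeated per layer.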