From 7b39bd4309450dbefb92667a1465783d2c6ba528 Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Thu, 16 Jul 2020 20:39:29 +0900
Subject: [PATCH] [optimizer/layers] Gradient dimension should match weight dim

Gradient dimensions should match the corresponding weight dimensions.
Currently the optimizer averages the gradients over the batch axis itself,
which is not correct since it changes the gradient dimension inside the
optimizer. Instead, average the gradients in each layer before calling
apply_gradients, so the gradient handed to the optimizer already matches
the weight dimension.

Resolves #280

Signed-off-by: Parichay Kapoor
---
 nntrainer/src/bn_layer.cpp     | 8 ++++----
 nntrainer/src/conv2d_layer.cpp | 6 +++++-
 nntrainer/src/fc_layer.cpp     | 8 ++++----
 nntrainer/src/optimizer.cpp    | 6 +-----
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/nntrainer/src/bn_layer.cpp b/nntrainer/src/bn_layer.cpp
index 8b0e026..862b9c4 100644
--- a/nntrainer/src/bn_layer.cpp
+++ b/nntrainer/src/bn_layer.cpp
@@ -53,10 +53,10 @@ int BatchNormalizationLayer::initialize(bool last) {
   beta.setZero();
 
   setParamSize(4);
-  paramsAt(0) = {std::move(mu), Tensor(dim), "BN:moving_average"};
-  paramsAt(1) = {std::move(var), Tensor(dim), "BN:moving_variance"};
-  paramsAt(2) = {std::move(gamma), Tensor(dim), "BN:gamma"};
-  paramsAt(3) = {std::move(beta), Tensor(dim), "BN:beta"};
+  paramsAt(0) = {std::move(mu), Tensor(mu.getDim()), "BN:moving_average"};
+  paramsAt(1) = {std::move(var), Tensor(var.getDim()), "BN:moving_variance"};
+  paramsAt(2) = {std::move(gamma), Tensor(gamma.getDim()), "BN:gamma"};
+  paramsAt(3) = {std::move(beta), Tensor(beta.getDim()), "BN:beta"};
 
   return status;
 }
diff --git a/nntrainer/src/conv2d_layer.cpp b/nntrainer/src/conv2d_layer.cpp
index b3589c5..a5a3d5f 100644
--- a/nntrainer/src/conv2d_layer.cpp
+++ b/nntrainer/src/conv2d_layer.cpp
@@ -224,7 +224,11 @@ Tensor Conv2DLayer::backwarding(Tensor derivative, int iteration) {
     delK = delK.chain()
              .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), filter,
                       weight_decay.lambda)
-             .run();
+             .run().average(0);
+  }
+  for (unsigned int i = filter_size; i < 2 * filter_size; ++i) {
+    Tensor &delBias = paramsAt(i).grad;
+    delBias = delBias.average(0);
   }
 
   opt.apply_gradients(params, param_size, iteration);
diff --git a/nntrainer/src/fc_layer.cpp b/nntrainer/src/fc_layer.cpp
index f8a6403..019daec 100644
--- a/nntrainer/src/fc_layer.cpp
+++ b/nntrainer/src/fc_layer.cpp
@@ -56,8 +56,8 @@ int FullyConnectedLayer::initialize(bool last) {
   }
 
   setParamSize(2);
-  paramsAt(0) = {std::move(weight), Tensor(dim), "FC:weight"};
-  paramsAt(1) = {std::move(bias), Tensor(1, unit), "FC:bias"};
+  paramsAt(0) = {std::move(weight), Tensor(weight.getDim()), "FC:weight"};
+  paramsAt(1) = {std::move(bias), Tensor(bias.getDim()), "FC:bias"};
 
   return status;
 }
@@ -139,13 +139,13 @@ Tensor FullyConnectedLayer::backwarding(Tensor derivative, int iteration) {
   Tensor &djdb = paramsAt(bias_idx).grad;
 
   Tensor ret = derivative.dot(weight.transpose("0:2:1"));
-  djdb = derivative;
+  djdb = derivative.average(0);
   djdw = input.chain()
            .transpose("0:2:1")
            .dot(derivative)
            .applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), weight,
                     weight_decay.lambda)
-           .run();
+           .run().average(0);
 
   if (trainable) {
     opt.apply_gradients(params, param_size, iteration);
diff --git a/nntrainer/src/optimizer.cpp b/nntrainer/src/optimizer.cpp
index 501e248..5a3e578 100644
--- a/nntrainer/src/optimizer.cpp
+++ b/nntrainer/src/optimizer.cpp
@@ -113,11 +113,7 @@ void Optimizer::apply_gradients(std::shared_ptr<UpdatableParam> params,
     UpdatableParam &param = param_data[i];
     Tensor &x = param.weight;
 
-    /// @fixme: #280 and use const Tensor &x_grad once fixed.
-    /// @note: that current implementation does not update grad since updating
-    /// grad changes it's dimension
-    Tensor x_grad = param.grad;
-    x_grad = x_grad.average(0);
+    const Tensor &x_grad = param.grad;
     switch (type) {
     case OptType::sgd:
       x.add_i(x_grad, -ll);
-- 
2.7.4
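
For illustration, a minimal standalone sketch of the convention this patch
adopts: each layer averages its per-sample gradients over the batch axis
before handing them to the optimizer, so the gradient buffer has exactly the
weight's shape. The sketch does not use the nntrainer Tensor API; the
containers and function names (average_over_batch, sgd_step) are hypothetical.

// Standalone C++ sketch, not nntrainer code.
#include <cstddef>
#include <vector>

// Average a row-major [batch x dim] gradient down to [dim],
// i.e. the same shape as the weight it will update.
std::vector<float> average_over_batch(const std::vector<float> &grad,
                                      std::size_t batch, std::size_t dim) {
  std::vector<float> avg(dim, 0.0f);
  for (std::size_t b = 0; b < batch; ++b)
    for (std::size_t d = 0; d < dim; ++d)
      avg[d] += grad[b * dim + d] / static_cast<float>(batch);
  return avg;
}

// Plain SGD step: because averaging already happened in the layer,
// the optimizer only sees a gradient of identical size to the weight.
void sgd_step(std::vector<float> &weight, const std::vector<float> &avg_grad,
              float learning_rate) {
  for (std::size_t d = 0; d < weight.size(); ++d)
    weight[d] -= learning_rate * avg_grad[d];
}

This mirrors djdb = derivative.average(0) in fc_layer.cpp and the delK/delBias
averaging in conv2d_layer.cpp, and is why optimizer.cpp can now bind param.grad
as a const reference instead of copying and re-averaging it.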