From ce6ced462cb549d32952a6551ba09968c7cc6f9d Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Thu, 25 Mar 2021 16:39:47 +0900
Subject: [PATCH] [ RNN ] Backwarding Implementation

This commit implements the gradient and derivative calculation
(calcGradient / calcDerivative) of the RNN layer.

Resolves:

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
 nntrainer/layers/rnn.cpp                    | 74 +++++++++++++++++++++++++++--
 test/unittest/unittest_nntrainer_layers.cpp | 33 +++++++++----
 2 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/nntrainer/layers/rnn.cpp b/nntrainer/layers/rnn.cpp
index b44d6d2..e8bb892 100644
--- a/nntrainer/layers/rnn.cpp
+++ b/nntrainer/layers/rnn.cpp
@@ -86,6 +86,7 @@ int RNNLayer::initialize(Manager &manager) {
 
 void RNNLayer::setProperty(const PropertyType type, const std::string &value) {
   int status = ML_ERROR_NONE;
+  // TODO : Add return_state property & api to get the hidden input
   switch (type) {
   case PropertyType::unit: {
     if (!value.empty()) {
@@ -123,6 +124,9 @@ void RNNLayer::forwarding(bool training) {
   Tensor hs_prev;
   Tensor hs;
 
+  if (training)
+    h_prev.setZero();
+
   // TODO : check merge b and t index
   for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
     Tensor islice = input_.getBatchSlice(b, 1);
@@ -151,7 +155,8 @@ void RNNLayer::forwarding(bool training) {
       // TODO : In-place calculation for activation
       acti_func.run_fn(hs, hs);
     }
-    h_prev.getBatchSlice(b, 1).copy(hs);
+    if (!training)
+      h_prev.getBatchSlice(b, 1).copy(hs);
   }
 }
 
@@ -163,11 +168,74 @@ void RNNLayer::copy(std::shared_ptr<Layer> l) {
 }
 
 void RNNLayer::calcDerivative() {
-  // NYI
+  Tensor &derivative_ = net_hidden[0]->getGradientRef();
+  Tensor &weight =
+    weightAt(static_cast<int>(RNNParams::weight_xh)).getVariableRef();
+  Tensor &ret_ = net_input[0]->getGradientRef();
+
+  derivative_.dot(weight, ret_, false, true);
 }
 
 void RNNLayer::calcGradient() {
-  // NYI
+  Tensor &djdw_x =
+    weightAt(static_cast<int>(RNNParams::weight_xh)).getGradientRef();
+  Tensor &djdw_h =
+    weightAt(static_cast<int>(RNNParams::weight_hh)).getGradientRef();
+  Tensor &djdb_h =
+    weightAt(static_cast<int>(RNNParams::bias_h)).getGradientRef();
+  Tensor &weight_hh =
+    weightAt(static_cast<int>(RNNParams::weight_hh)).getVariableRef();
+
+  Tensor &derivative_ = net_hidden[0]->getGradientRef();
+  Tensor &hidden_ = net_hidden[0]->getVariableRef();
+  Tensor &input_ = net_input[0]->getVariableRef();
+  Tensor dh_nx = Tensor(TensorDim(1, 1, 1, derivative_.width()));
+
+  for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
+    Tensor deriv_t = derivative_.getBatchSlice(b, 1);
+    Tensor xs_t = input_.getBatchSlice(b, 1);
+    Tensor hs_t = hidden_.getBatchSlice(b, 1);
+    dh_nx.setZero();
+
+    Tensor dh;
+    Tensor xs;
+    Tensor hs_prev;
+    Tensor hs;
+
+    for (unsigned int t = deriv_t.height(); t-- > 0;) {
+      dh = deriv_t.getSharedDataTensor(TensorDim(1, 1, 1, deriv_t.width()),
+                                       t * deriv_t.width());
+      xs = xs_t.getSharedDataTensor(TensorDim(1, 1, 1, xs_t.width()),
+                                    t * xs_t.width());
+      hs = hs_t.getSharedDataTensor(TensorDim(1, 1, 1, hs_t.width()),
+                                    t * hs_t.width());
+      if (t == 0) {
+        hs_prev = Tensor(TensorDim(1, 1, 1, hs_t.width()));
+        hs_prev.setZero();
+      } else {
+        hs_prev = hs_t.getSharedDataTensor(TensorDim(1, 1, 1, hs_t.width()),
+                                           (t - 1) * hs_t.width());
+      }
+
+      if (t < deriv_t.height() - 1) {
+        dh.add_i(dh_nx);
+      }
+
+      acti_func.run_prime_fn(hs, dh, dh);
+      dh.multiply_i(hs);
+
+      float alpha = 1.0;
+
+      if (b != 0) {
+        alpha = 0.0;
+      }
+
+      djdb_h.add_i(dh, alpha);
+      djdw_x.add_i(xs.dot(dh, true, false), alpha);
+      djdw_h.add_i(hs_prev.dot(dh, true, false), alpha);
+      dh.dot(weight_hh, dh_nx, false, true, 1.0);
+    }
+  }
 }
 
 } // namespace nntrainer
diff --git a/test/unittest/unittest_nntrainer_layers.cpp b/test/unittest/unittest_nntrainer_layers.cpp
index 40880ad..e57e9a1 100644
--- a/test/unittest/unittest_nntrainer_layers.cpp
+++ b/test/unittest/unittest_nntrainer_layers.cpp
@@ -2351,24 +2351,39 @@ TEST_F(nntrainer_RNNLayer, initialize_01_p) {
 }
 
 TEST_F(nntrainer_RNNLayer, forwarding_01_p) {
+
+  status = reinitialize();
+  EXPECT_EQ(status, ML_ERROR_NONE);
   float data[18] = {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8};
 
   sharedTensor input = std::shared_ptr<nntrainer::Tensor>(
     new nntrainer::Tensor[1], std::default_delete<nntrainer::Tensor[]>());
   nntrainer::Tensor &in = *input;
   in = nntrainer::Tensor(nntrainer::TensorDim(2, 1, 3, 3), data);
-  nntrainer::Manager manager;
-  manager.setInferenceInOutMemoryOptimization(false);
+  allocateMemory();
+  EXPECT_NO_THROW(layer.forwarding_with_val({input}, {}, false));
+}
 
-  layer.setInputBuffers(manager.trackLayerInputs(
-    layer.getType(), layer.getName(), layer.getInputDimension()));
-  layer.setOutputBuffers(manager.trackLayerOutputs(
-    layer.getType(), layer.getName(), layer.getOutputDimension()));
+TEST_F(nntrainer_RNNLayer, backwarding_01_p) {
+  status = reinitialize();
+  EXPECT_EQ(status, ML_ERROR_NONE);
+  float data[18] = {1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8};
 
-  manager.initializeWeights();
-  manager.initializeTensors(false);
-  manager.allocateTensors();
+  sharedTensor input = std::shared_ptr<nntrainer::Tensor>(
+    new nntrainer::Tensor[1], std::default_delete<nntrainer::Tensor[]>());
+  nntrainer::Tensor &in = *input;
+  in = nntrainer::Tensor(nntrainer::TensorDim(2, 1, 3, 3), data);
+  allocateMemory();
   EXPECT_NO_THROW(layer.forwarding_with_val({input}, {}, false));
+
+  nntrainer::Tensor derivatives(2, 1, 3, 3);
+  derivatives.setValue(1.0);
+
+  setOptimizer(nntrainer::OptType::SGD, "learning_rate=1.0");
+
+  nntrainer::Tensor result;
+  EXPECT_NO_THROW(result = *layer.backwarding_with_val(
+                    1, {MAKE_SHARED_TENSOR(derivatives)}, opt)[0]);
 }
 
 /**
-- 
2.7.4
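For reference while reviewing, the new calcGradient() walks each batch sample backwards in time, which is the usual backpropagation-through-time (BPTT) pattern for a vanilla RNN with h_t = f(x_t * W_xh + h_{t-1} * W_hh + b_h): accumulate db_h += dpre_t, dW_xh += x_t^T * dpre_t, dW_hh += h_{t-1}^T * dpre_t, and carry dh_{t-1} = dpre_t * W_hh^T into the previous step (the dh_nx tensor), where dpre_t is the incoming derivative pushed back through the activation. The sketch below is a minimal, self-contained version of that textbook recurrence using plain arrays and tanh, not the nntrainer Tensor API; the sizes, constant initial values, and the all-ones incoming derivative (mirroring derivatives.setValue(1.0) in the new unit test) are assumptions for illustration only.

```cpp
// Standalone BPTT sketch for a vanilla tanh RNN (illustration only).
// All names, sizes, and inputs here are assumptions, not nntrainer API.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int T = 3, I = 3, H = 2; // time steps, input width, hidden units
  std::vector<std::vector<double>> x(T, std::vector<double>(I, 0.1));
  std::vector<std::vector<double>> Wxh(I, std::vector<double>(H, 0.05));
  std::vector<std::vector<double>> Whh(H, std::vector<double>(H, 0.05));
  std::vector<double> bh(H, 0.0);

  // Forward: h_t = tanh(x_t * W_xh + h_{t-1} * W_hh + b_h), with h_{-1} = 0
  std::vector<std::vector<double>> h(T, std::vector<double>(H, 0.0));
  std::vector<double> h_prev(H, 0.0);
  for (int t = 0; t < T; ++t) {
    for (int j = 0; j < H; ++j) {
      double a = bh[j];
      for (int i = 0; i < I; ++i)
        a += x[t][i] * Wxh[i][j];
      for (int k = 0; k < H; ++k)
        a += h_prev[k] * Whh[k][j];
      h[t][j] = std::tanh(a);
    }
    h_prev = h[t];
  }

  // Backward: walk t from T-1 down to 0, carrying dh_nx = dpre_t * W_hh^T
  std::vector<std::vector<double>> dWxh(I, std::vector<double>(H, 0.0));
  std::vector<std::vector<double>> dWhh(H, std::vector<double>(H, 0.0));
  std::vector<double> dbh(H, 0.0);
  std::vector<double> dh_nx(H, 0.0);
  for (int t = T - 1; t >= 0; --t) {
    std::vector<double> hprev =
      (t == 0) ? std::vector<double>(H, 0.0) : h[t - 1];
    std::vector<double> dpre(H, 0.0);
    for (int j = 0; j < H; ++j) {
      double dh = 1.0; // incoming dL/dh_t; all ones, as in the unit test
      if (t < T - 1)
        dh += dh_nx[j];                           // carry from step t+1
      dpre[j] = dh * (1.0 - h[t][j] * h[t][j]);   // tanh'(a_t) = 1 - h_t^2
      dbh[j] += dpre[j];                          // db_h  += dpre_t
    }
    for (int i = 0; i < I; ++i)                   // dW_xh += x_t^T * dpre_t
      for (int j = 0; j < H; ++j)
        dWxh[i][j] += x[t][i] * dpre[j];
    for (int k = 0; k < H; ++k)                   // dW_hh += h_{t-1}^T * dpre_t
      for (int j = 0; j < H; ++j)
        dWhh[k][j] += hprev[k] * dpre[j];
    for (int k = 0; k < H; ++k) {                 // dh_nx = dpre_t * W_hh^T
      dh_nx[k] = 0.0;
      for (int j = 0; j < H; ++j)
        dh_nx[k] += dpre[j] * Whh[k][j];
    }
  }

  std::printf("dWxh[0][0]=%f dWhh[0][0]=%f dbh[0]=%f\n",
              dWxh[0][0], dWhh[0][0], dbh[0]);
  return 0;
}
```

Built with any C++11 compiler, this can be tweaked (sizes, inputs, a small perturbation of one weight for a finite-difference check) to sanity-check gradient values independently when exercising the layer's backwarding path.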