From: jijoong.moon Date: Wed, 16 Jun 2021 08:20:26 +0000 (+0900) Subject: [ GRU ] Add GRU Unittest X-Git-Tag: accepted/tizen/unified/20210829.234903~271 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c7e378a0d6d1d73b16c9f28eaebe62397ce58784;p=platform%2Fcore%2Fml%2Fnntrainer.git [ GRU ] Add GRU Unittest This commit includes, . unittests of gru layer . add keras code to compare **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- diff --git a/nntrainer/layers/gru.cpp b/nntrainer/layers/gru.cpp index 6a6fe7b..f5cb230 100644 --- a/nntrainer/layers/gru.cpp +++ b/nntrainer/layers/gru.cpp @@ -9,6 +9,21 @@ * @author Jijoong Moon * @bug No known bugs except for NYI items * + * h_prev --------d1------->[*]-------d0----->[+]---d0--> h + * dh_nx | | | | d0 dh + * | d14 | d2 d3 | + * | | +-----[1-]------>[*] + * | [*]<---+ d15 |d5 | d6 + * | | |rt | zt |gt + * | | [sig] [sig] [tanh] + * | | |d16 | d7 |d8 + * | | [+] [+] [+] + * | | / \d16 | \ d7 / \ d8 + * | | Wxhr Whhr Wxhz Whhz Wxhg Whhg + * | | |d17 |d13 |d12 |d11 |d10 | d9 + * +- |--+------|---+ | | | + * +---------|--------|----------+ | + * xs------------------+--------+---------------+ */ #include @@ -203,14 +218,18 @@ void GRULayer::forwarding(bool training) { hs_prev = h_prev.getBatchSlice(b, 1); } - xs.dot(weight_xh, zrg_t); + xs.dot(weight_xh, zrg_t); // x_z, x_r, x_g Tensor ztrt = zrg_t.getSharedDataTensor({unit * 2}, 0); Tensor ztrt_b = bias_h.getSharedDataTensor({unit * 2}, 0); - Tensor w_hh = weight_hh.getSharedDataTensor({unit * unit * 2}, 0); - Tensor w_g = - weight_hh.getSharedDataTensor({unit * unit}, unit * unit * 2); + Tensor w_hh; + w_hh.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false)); + Tensor w_g; + w_g.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false)); + Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2); Tensor gt_b = 
bias_h.getSharedDataTensor({unit}, unit * 2); @@ -223,13 +242,15 @@ void GRULayer::forwarding(bool training) { recurrent_acti_func.run_fn(rt, rt); recurrent_acti_func.run_fn(zt, zt); - gt.add_i(rt.multiply(hs_prev).dot(w_g)); + Tensor temp; + rt.multiply(hs_prev, temp); + gt.add_i(temp.dot(w_g)); gt.add_i(gt_b); acti_func.run_fn(gt, gt); zt.multiply(hs_prev, hs); - Tensor a = zt.multiply(-1.0).add(1.0); - hs.add_i(gt.multiply(a)); + temp = zt.multiply(-1.0).add(1.0); + hs.add_i(gt.multiply(temp)); } h_prev.getBatchSlice(b, 1).copy(hs); } @@ -279,7 +300,10 @@ void GRULayer::calcGradient() { weightAt(static_cast(GRUParams::weight_hh)).getVariableRef(); djdw_x.setZero(); - djdw_h.setZero(); + Tensor djdw_zr_h = Tensor({1, 1, unit, unit * 2}, true); + djdw_zr_h.setZero(); + Tensor djdw_g_h = Tensor({1, 1, unit, unit}, true); + djdw_g_h.setZero(); djdb_h.setZero(); hidden->getGradientRef().setZero(); @@ -313,7 +337,6 @@ void GRULayer::calcGradient() { Tensor dh; Tensor hs_prev; - Tensor hs; Tensor xs; Tensor dzrg_ = zrg->getGradientRef().getBatchSlice(b, 1); Tensor zrg_ = zrg->getVariableRef().getBatchSlice(b, 1); @@ -321,7 +344,6 @@ void GRULayer::calcGradient() { for (unsigned int t = deriv_t.height(); t-- > 0;) { dh = deriv_t.getSharedDataTensor({deriv_t.width()}, t * deriv_t.width()); xs = xs_t.getSharedDataTensor({xs_t.width()}, t * xs_t.width()); - hs = hs_t.getSharedDataTensor({hs_t.width()}, t * hs_t.width()); Tensor dzrg_t = dzrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE); @@ -347,38 +369,56 @@ void GRULayer::calcGradient() { Tensor rt = zrg_t.getSharedDataTensor({unit}, unit); Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2); - dh.multiply(hs_prev, dhz); - dhz.subtract_i(gt.multiply(dh)); + zt.multiply(dh, dh_nx); // dh_nx = d1 + + dh.multiply(hs_prev, dhz); // dhz = d2 + dhz.subtract_i(gt.multiply(dh)); // dhz = d5 zt.multiply(-1.0, dhg); dhg.add_i(1.0); - dhg.multiply_i(dh); - recurrent_acti_func.run_prime_fn(zt, dhz, dhz); - 
acti_func.run_prime_fn(gt, dhg, dhg); + dhg.multiply_i(dh); // dhg = d6 - Tensor dhzr = dzrg_t.getSharedDataTensor({unit * 2}, 0); - Tensor djdw_zr_h = djdw_h.getSharedDataTensor({unit * unit * 2}, 0); - Tensor djdw_g_h = - djdw_h.getSharedDataTensor({unit * unit}, unit * unit * 2); + recurrent_acti_func.run_prime_fn(zt, dhz, dhz); // dhz = d7 + acti_func.run_prime_fn(gt, dhg, dhg); // dhg = d8 - Tensor wg_hh = - weight_hh.getSharedDataTensor({unit * unit}, unit * unit * 2); - Tensor wzr_hh = weight_hh.getSharedDataTensor({unit * unit * 2}, 0); + Tensor dhzr = dzrg_t.getSharedDataTensor({unit * 2}, 0); // dhz+dhr - dhg.multiply(wg_hh, dh_nx); - hs_prev.multiply(dh_nx, dhr); - dh_nx.multiply_i(rt); - recurrent_acti_func.run_prime_fn(rt, dhr, dhr); + Tensor wg_hh; + wg_hh.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false)); + Tensor wzr_hh; + wzr_hh.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false)); - djdb_h.add_i(dzrg_t); + Tensor temp = Tensor({hs_t.width()}); + temp.setZero(); + dhg.dot(wg_hh, temp, false, true); // temp = d10 + hs_prev.multiply(temp, dhr); // dhr = d15 + temp.multiply_i(rt); // temp=d14 + dh_nx.add_i(temp); // dh_nx = d1 + d14 + // reset temp : hs_prev * rt for djdw_g_h + hs_prev.multiply(rt, temp); + recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16 + + djdb_h.add_i(dzrg_t); // dzrg_t = d7+d16+d8 djdw_x.add_i(xs.dot(dzrg_t, true, false)); - djdw_zr_h.add_i(hs_prev.dot(dhzr, true, false)); - djdw_g_h.add_i(hs_prev.multiply(rt).dot(dhg, true, false)); - dhzr.dot(wzr_hh, dh_nx, false, true); - dh_nx.add_i(zt.multiply(dh)); + djdw_zr_h.add_i(hs_prev.dot(dhzr, true, false)); + djdw_g_h.add_i(temp.dot(dhg, true, false)); + dhzr.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17 } } + for (unsigned int h = 0; h < unit; ++h) { + float *data = djdw_zr_h.getAddress(h * unit * 2); + float *rdata = djdw_h.getAddress(h * unit * NUM_GATE); + std::copy(data, 
data + unit * 2, rdata); + } + + for (unsigned int h = 0; h < unit; ++h) { + float *data = djdw_g_h.getAddress(h * unit); + float *rdata = djdw_h.getAddress(h * unit * NUM_GATE + unit * 2); + std::copy(data, data + unit, rdata); + } } } // namespace nntrainer diff --git a/nntrainer/layers/gru.h b/nntrainer/layers/gru.h index ae88a62..f920799 100644 --- a/nntrainer/layers/gru.h +++ b/nntrainer/layers/gru.h @@ -30,11 +30,14 @@ public: * @brief Constructor of GRULayer */ template - GRULayer(unsigned int unit_ = 0, - ActivationType recurrent_activation_type_ = ActivationType::ACT_NONE, - bool sequence = false, Args... args) : + GRULayer( + unsigned int unit_ = 0, + ActivationType hidden_state_activation_type_ = ActivationType::ACT_NONE, + ActivationType recurrent_activation_type_ = ActivationType::ACT_NONE, + bool sequence = false, Args... args) : LayerV1(args...), unit(unit_), + hidden_state_activation_type(hidden_state_activation_type_), recurrent_activation_type(recurrent_activation_type_), return_sequences(sequence){}; diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index a3f1925..7e02b99 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -328,7 +328,6 @@ void LSTMLayer::calcGradient() { Tensor xs; Tensor hs_prev; Tensor cs_prev; - Tensor hs; Tensor cs; Tensor dc; Tensor dfgio_ = fgio->getGradientRef().getBatchSlice(b, 1); @@ -339,7 +338,6 @@ void LSTMLayer::calcGradient() { dc = derivc_t.getSharedDataTensor({derivc_t.width()}, t * derivc_t.width()); xs = xs_t.getSharedDataTensor({xs_t.width()}, t * xs_t.width()); - hs = hs_t.getSharedDataTensor({hs_t.width()}, t * hs_t.width()); cs = cs_t.getSharedDataTensor({cs_t.width()}, t * cs_t.width()); Tensor dfgio_t = diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index a3dcca6..6b95376 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -497,8 +497,8 @@ void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest, 
src.src_tensor->tensor(), offset + src.src_tensor->offset()); } -Tensor Tensor::getSharedDataTensor(const TensorDim dim_, - unsigned int offset) const { +Tensor Tensor::getSharedDataTensor(const TensorDim dim_, unsigned int offset, + bool reset_stride) const { Tensor ret = *this; if (dim_.getDataLen() + offset > dim.getDataLen()) @@ -506,7 +506,8 @@ Tensor Tensor::getSharedDataTensor(const TensorDim dim_, "Creating shared tensor of size bigger than tensor memory."); ret.dim = dim_; - ret.strides = ret.dim.computeStrides(); + if (reset_stride) + ret.strides = ret.dim.computeStrides(); /** * In this case, its the caller's responsibility to ensure that allocate() is @@ -715,6 +716,7 @@ Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const { Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m, float beta) const { if (m.dim.rank() > 2) { + throw exception::not_supported("Error: support only for rank of dot " "matrix <= 2"); } @@ -969,6 +971,46 @@ void Tensor::copy(const float *buf) noexcept { scopy(length(), buf, 1, getData(), 1); } +void Tensor::copy_with_stride(const Tensor &from) { + if (from.length() != 0 && length() == from.length()) { + reshape(from.getDim()); + for (unsigned int b = 0; b < from.batch(); ++b) { + unsigned int from_b = b * from.strides[0]; + unsigned int t_b = b * from.channel() * from.height() * from.width(); + for (unsigned int c = 0; c < from.channel(); ++c) { + unsigned int from_c = c * from.strides[1]; + unsigned int t_c = c * from.height() * from.width(); + for (unsigned int h = 0; h < from.height(); ++h) { + unsigned int from_h = h * from.strides[2]; + unsigned int t_h = h * from.width(); + for (unsigned int w = 0; w < from.width(); ++w) { + unsigned int from_w = w * from.strides[3]; + getData()[t_b + t_c + t_h + w] = + from.getData()[from_b + from_c + from_h + from_w]; + } + } + } + } + } else { + Tensor t = Tensor(from.getDim(), true); + for (unsigned int b = 0; b < from.batch(); ++b) { + unsigned 
int from_b = b * from.strides[0]; + for (unsigned int c = 0; c < from.channel(); ++c) { + unsigned int from_c = c * from.strides[1]; + for (unsigned int h = 0; h < from.height(); ++h) { + unsigned int from_h = h * from.strides[2]; + for (unsigned int w = 0; w < from.width(); ++w) { + unsigned int from_w = w * from.strides[3]; + t.setValue(b, c, h, w, + from.getData()[from_b + from_c + from_h + from_w]); + } + } + } + } + swap(t, *this); + } +} + void Tensor::copy(const Tensor &from) { // todo: enable copy to non-contiguous tensor if (!is_contiguous) { diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 8af82af..5a98da7 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -739,6 +739,12 @@ public: void copy(const Tensor &from); /** + * @brief Copy the Tensor element by element, reading the source through its strides; this Tensor takes the dimension of from + * @param[in] from Tensor to be copied (may be a strided view of a larger tensor) + */ + void copy_with_stride(const Tensor &from); + + /** * @brief Get slice of the tensor, sliced by batch * @param[in] offset offset in batch to start the slice * @param[in] size size of the slice @@ -759,7 +765,8 @@ public: * @note New size added with offset must be less than the size of the original * tensor. 
*/ - Tensor getSharedDataTensor(const TensorDim dim, unsigned int offset) const; + Tensor getSharedDataTensor(const TensorDim dim, unsigned int offset, + bool reset_stride = true) const; /** * @brief make this tensor share memory with given tensor diff --git a/packaging/unittest_models.tar.gz b/packaging/unittest_models.tar.gz index 7f9f0ec..0d8adbc 100644 Binary files a/packaging/unittest_models.tar.gz and b/packaging/unittest_models.tar.gz differ diff --git a/test/input_gen/genModelTests.py b/test/input_gen/genModelTests.py index 5bde0ce..41fdfe5 100644 --- a/test/input_gen/genModelTests.py +++ b/test/input_gen/genModelTests.py @@ -425,3 +425,50 @@ if __name__ == "__main__": ) multi_rnn_layer_tc(1,2)(file_name="multi_rnn_return_sequence.info") multi_rnn_layer_tc(2,2)(file_name="multi_rnn_return_sequence_with_batch.info") + + gru_layer_tc = lambda batch, time, return_sequences: partial( + record, + model=[ + K.Input(batch_shape=(batch, time, 1)), + K.layers.GRU( + time, + recurrent_activation="sigmoid", + activation="tanh", + return_sequences=return_sequences, + ), + K.layers.Dense(1), + ], + optimizer=opt.SGD(learning_rate=0.1), + iteration=10, + input_shape=(batch, time, 1), + label_shape=(batch, time, 1), + is_onehot=False, + loss_fn_str="mse" + ) + + gru_layer_tc(1, 1, False)(file_name="gru_basic.info") + gru_layer_tc(1, 2, True)(file_name="gru_return_sequence.info") + gru_layer_tc(2, 2, True)(file_name="gru_return_sequence_with_batch.info") + + multi_gru_layer_tc = lambda batch, time: partial( + record, + model=[ + K.Input(batch_shape=(batch, time, 1)), + K.layers.GRU( + time, + recurrent_activation="sigmoid", + activation="tanh", + return_sequences=True, + ), + K.layers.GRU(time, recurrent_activation="sigmoid", activation="tanh"), + K.layers.Dense(1), + ], + optimizer=opt.SGD(learning_rate=0.1), + iteration=10, + input_shape=(batch, time, 1), + label_shape=(batch, 1), + is_onehot=False, + loss_fn_str="mse", + ) + 
multi_gru_layer_tc(1,2)(file_name="multi_gru_return_sequence.info") + multi_gru_layer_tc(2,2)(file_name="multi_gru_return_sequence_with_batch.info") diff --git a/test/unittest/unittest_nntrainer_models.cpp b/test/unittest/unittest_nntrainer_models.cpp index deab140..7eadc1a 100644 --- a/test/unittest/unittest_nntrainer_models.cpp +++ b/test/unittest/unittest_nntrainer_models.cpp @@ -1208,6 +1208,71 @@ INI multi_rnn_return_sequence_with_batch( } ); +INI gru_basic( + "gru_basic", + { + nn_base + "loss=mse | batch_size=1", + sgd_base + "learning_rate = 0.1", + I("input") + input_base + "input_shape=1:1:1", + I("gru") + gru_base + + "unit = 1" + "input_layers=input", + I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru" + } +); + +INI gru_return_sequence( + "gru_return_sequence", + { + nn_base + "loss=mse | batch_size=1", + sgd_base + "learning_rate = 0.1", + I("input") + input_base + "input_shape=1:2:1", + I("gru") + gru_base + + "unit = 2" + "input_layers=input"+ "return_sequences=true", + I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru" + } +); + +INI gru_return_sequence_with_batch( + "gru_return_sequence_with_batch", + { + nn_base + "loss=mse | batch_size=2", + sgd_base + "learning_rate = 0.1", + I("input") + input_base + "input_shape=1:2:1", + I("gru") + gru_base + + "unit = 2" + "input_layers=input"+ "return_sequences=true", + I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru" + } +); + +INI multi_gru_return_sequence( + "multi_gru_return_sequence", + { + nn_base + "loss=mse | batch_size=1", + sgd_base + "learning_rate = 0.1", + I("input") + input_base + "input_shape=1:2:1", + I("gru") + gru_base + + "unit = 2" + "input_layers=input"+ "return_sequences=true", + I("gru2") + gru_base + + "unit = 2" + "input_layers=gru", + I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2" + } +); + + +INI multi_gru_return_sequence_with_batch( + "multi_gru_return_sequence_with_batch", + { + nn_base + "loss=mse | batch_size=2", + sgd_base + 
"learning_rate = 0.1", + I("input") + input_base + "input_shape=1:2:1", + I("gru") + gru_base + + "unit = 2" + "input_layers=input"+ "return_sequences=true", + I("gru2") + gru_base + + "unit = 2" + "input_layers=gru", + I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2" + } +); + INSTANTIATE_TEST_CASE_P( nntrainerModelAutoTests, nntrainerModelTest, ::testing::Values( mkModelTc(fc_sigmoid_mse, "3:1:1:10", 10), @@ -1260,7 +1325,12 @@ INSTANTIATE_TEST_CASE_P( mkModelTc(rnn_return_sequences, "1:1:2:1", 10), mkModelTc(rnn_return_sequence_with_batch, "2:1:2:1", 10), mkModelTc(multi_rnn_return_sequence, "1:1:1:1", 10), - mkModelTc(multi_rnn_return_sequence_with_batch, "2:1:1:1", 10) + mkModelTc(multi_rnn_return_sequence_with_batch, "2:1:1:1", 10), + mkModelTc(gru_basic, "1:1:1:1", 10), + mkModelTc(gru_return_sequence, "1:1:2:1", 10), + mkModelTc(gru_return_sequence_with_batch, "2:1:2:1", 10), + mkModelTc(multi_gru_return_sequence, "1:1:1:1", 10), + mkModelTc(multi_gru_return_sequence_with_batch, "2:1:1:1", 10) ), [](const testing::TestParamInfo& info){ return std::get<0>(info.param).getName(); });