[rnn] enable 2 biases
author: hyeonseok lee <hs89.lee@samsung.com>
Thu, 9 Dec 2021 02:48:07 +0000 (11:48 +0900)
committer: Jijoong Moon <jijoong.moon@samsung.com>
Mon, 13 Dec 2021 02:50:23 +0000 (11:50 +0900)
 - Enable separate bias_ih and bias_hh in rnn (selected by the new integrate_bias property)
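 - With integrate_bias=false the cell follows the two-bias convention
   (as in e.g. PyTorch's RNN):
     h[t] = acti(x[t] · weight_ih + bias_ih + h[t-1] · weight_hh + bias_hh)
 - With integrate_bias=true the previous single-bias form is kept:
     h[t] = acti(x[t] · weight_ih + bias_h + h[t-1] · weight_hh)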

Self evaluation:

Build test: [X]Passed [ ]Failed [ ]Skipped
Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: hyeonseok lee <hs89.lee@samsung.com>
nntrainer/layers/rnn.cpp
nntrainer/layers/rnn.h
test/unittest/layers/unittest_layers_rnn.cpp
test/unittest/unittest_nntrainer_models.cpp

diff --git a/nntrainer/layers/rnn.cpp b/nntrainer/layers/rnn.cpp
index 6e4af91..ab4e880 100644
@@ -23,99 +23,123 @@ namespace nntrainer {
 
 static constexpr size_t SINGLE_INOUT_IDX = 0;
 
-// - weight_xh ( input to hidden )
-//  : [1, 1, input_size, unit (hidden_size) ]
+// - weight_ih ( input to hidden )
 // - weight_hh ( hidden to hidden )
-//  : [1, 1, unit (hidden_size) , unit (hidden_size)]
-// - bias_h ( hidden bias )
-//  : [1, 1, 1, unit (hidden_size)]
-enum RNNParams { weight_xh, weight_hh, bias_h, hidden_state, dropout_mask };
+// - bias_h ( integrated input + hidden bias; used when integrate_bias=true )
+// - bias_ih ( input bias; used when integrate_bias=false )
+// - bias_hh ( hidden bias; used when integrate_bias=false )
+enum RNNParams {
+  weight_ih,
+  weight_hh,
+  bias_h,
+  bias_ih,
+  bias_hh,
+  hidden_state,
+  dropout_mask
+};
 
 RNNLayer::RNNLayer() :
   LayerImpl(),
-  rnn_props(props::Unit(), props::HiddenStateActivation(),
-            props::ReturnSequences(), props::DropOutRate()),
+  rnn_props(
+    props::Unit(), props::HiddenStateActivation() = ActivationType::ACT_TANH,
+    props::ReturnSequences(), props::DropOutRate(), props::IntegrateBias()),
   acti_func(ActivationType::ACT_NONE, true),
   epsilon(1e-3) {
   wt_idx.fill(std::numeric_limits<unsigned>::max());
 }
 
 void RNNLayer::finalize(InitLayerContext &context) {
-  auto &weight_regularizer =
+  const nntrainer::WeightRegularizer weight_regularizer =
     std::get<props::WeightRegularizer>(*layer_impl_props);
-  auto &weight_regularizer_constant =
+  const float weight_regularizer_constant =
     std::get<props::WeightRegularizerConstant>(*layer_impl_props);
-  auto &weight_initializer =
+  const Tensor::Initializer weight_initializer =
     std::get<props::WeightInitializer>(*layer_impl_props);
-  auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
-
-  auto unit = std::get<props::Unit>(rnn_props).get();
-  auto &hidden_state_activation_type =
-    std::get<props::HiddenStateActivation>(rnn_props);
-  bool return_sequences = std::get<props::ReturnSequences>(rnn_props);
-  float dropout_rate = std::get<props::DropOutRate>(rnn_props);
+  const Tensor::Initializer bias_initializer =
+    std::get<props::BiasInitializer>(*layer_impl_props);
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(rnn_props).get();
+  const nntrainer::ActivationType hidden_state_activation_type =
+    std::get<props::HiddenStateActivation>(rnn_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(rnn_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(rnn_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(rnn_props).get();
 
   if (context.getNumInputs() != 1) {
     throw std::invalid_argument("RNN layer takes only one input");
   }
 
-  TensorDim output_dim;
-  const TensorDim &input_dim = context.getInputDimensions()[0];
-
   // input_dim = [ batch, 1, time_iteration, feature_size ]
-  // outut_dim = [ batch, 1, time_iteration, hidden_size ( unit ) ]
-  output_dim = input_dim;
-  output_dim.width(unit);
+  const TensorDim &input_dim = context.getInputDimensions()[SINGLE_INOUT_IDX];
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  const unsigned int feature_size = input_dim.width();
 
-  if (dropout_rate > epsilon) {
-    wt_idx[RNNParams::dropout_mask] = context.requestTensor(
-      output_dim, "dropout_mask", Tensor::Initializer::NONE, false,
-      TensorLifespan::ITERATION_LIFESPAN);
-  }
-
-  if (!return_sequences) {
-    output_dim.height(1u);
-  }
+  // output_dim = [ batch, 1, (return_sequences ? time_iteration : 1), unit ]
+  const TensorDim output_dim(batch_size, 1, return_sequences ? max_timestep : 1,
+                             unit);
 
   context.setOutputDimensions({output_dim});
 
-  TensorDim bias_dim = TensorDim();
-  bias_dim.setTensorDim(3, unit);
-
-  TensorDim dim_xh = output_dim;
-  dim_xh.height(input_dim.width());
-  dim_xh.batch(1);
-
-  TensorDim dim_hh = output_dim;
-  dim_hh.height(unit);
-  dim_hh.batch(1);
-
-  // weight_initializer can be set seperately. weight_xh initializer,
-  // weight_hh initializer kernel initializer & recurrent_initializer in keras
-  // for now, it is set same way.
+  // weight_initializer can be set separately: weight_ih and weight_hh
+  // initializers correspond to kernel_initializer & recurrent_initializer in
+  // keras. For now, both are set the same way.
 
-  wt_idx[RNNParams::weight_xh] =
-    context.requestWeight(dim_xh, weight_initializer, weight_regularizer,
-                          weight_regularizer_constant, "weight_xh", true);
+  // weight_ih_dim : [ 1, 1, feature_size, unit ]
+  const TensorDim weight_ih_dim({feature_size, unit});
+  wt_idx[RNNParams::weight_ih] =
+    context.requestWeight(weight_ih_dim, weight_initializer, weight_regularizer,
+                          weight_regularizer_constant, "weight_ih", true);
+  // weight_hh_dim : [ 1, 1, unit, unit ]
+  const TensorDim weight_hh_dim({unit, unit});
   wt_idx[RNNParams::weight_hh] =
-    context.requestWeight(dim_hh, weight_initializer, weight_regularizer,
+    context.requestWeight(weight_hh_dim, weight_initializer, weight_regularizer,
                           weight_regularizer_constant, "weight_hh", true);
-  wt_idx[RNNParams::bias_h] = context.requestWeight(
-    bias_dim, bias_initializer, WeightRegularizer::NONE, 1.0f, "bias_h", true);
+  if (!disable_bias) {
+    if (integrate_bias) {
+      // bias_h_dim : [ 1, 1, 1, unit ]
+      const TensorDim bias_h_dim({unit});
+      wt_idx[RNNParams::bias_h] =
+        context.requestWeight(bias_h_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_h", true);
+    } else {
+      // bias_ih_dim : [ 1, 1, 1, unit ]
+      const TensorDim bias_ih_dim({unit});
+      wt_idx[RNNParams::bias_ih] =
+        context.requestWeight(bias_ih_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_ih", true);
+      // bias_hh_dim : [ 1, 1, 1, unit ]
+      const TensorDim bias_hh_dim({unit});
+      wt_idx[RNNParams::bias_hh] =
+        context.requestWeight(bias_hh_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_hh", true);
+    }
+  }
 
   // We do not need this if we reuse net_hidden[0], but if we do, the unit
   // test will fail because it modifies the data during gradient calculation.
   // TODO : We could control this with something like a #define to save memory
-  TensorDim d = input_dim;
-  d.width(unit);
-  wt_idx[RNNParams::hidden_state] =
-    context.requestTensor(d, "hidden_state", Tensor::Initializer::NONE, true,
-                          TensorLifespan::ITERATION_LIFESPAN);
-
-  if (hidden_state_activation_type.get() == ActivationType::ACT_NONE) {
-    hidden_state_activation_type.set(ActivationType::ACT_TANH);
+
+  // hidden_state_dim : [ batch_size, 1, max_timestep, unit ]
+  const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit);
+  wt_idx[RNNParams::hidden_state] = context.requestTensor(
+    hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true,
+    TensorLifespan::ITERATION_LIFESPAN);
+
+  if (dropout_rate > epsilon) {
+    // dropout_mask_dim = [ batch, 1, (return_sequences ? time_iteration : 1),
+    // unit ]
+    const TensorDim dropout_mask_dim(batch_size, 1,
+                                     return_sequences ? max_timestep : 1, unit);
+    wt_idx[RNNParams::dropout_mask] = context.requestTensor(
+      dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false,
+      TensorLifespan::ITERATION_LIFESPAN);
   }
-  acti_func.setActiFunc(hidden_state_activation_type.get());
+
+  acti_func.setActiFunc(hidden_state_activation_type);
 
   if (!acti_func.supportInPlace())
     throw exception::not_supported(
@@ -123,7 +147,8 @@ void RNNLayer::finalize(InitLayerContext &context) {
 }
 
 void RNNLayer::setProperty(const std::vector<std::string> &values) {
-  auto remain_props = loadProperties(values, rnn_props);
+  const std::vector<std::string> &remain_props =
+    loadProperties(values, rnn_props);
   LayerImpl::setProperty(remain_props);
 }
 
@@ -133,133 +158,197 @@ void RNNLayer::exportTo(Exporter &exporter, const ExportMethods &method) const {
 }
 
 void RNNLayer::forwarding(RunLayerContext &context, bool training) {
-  bool return_sequences = std::get<props::ReturnSequences>(rnn_props);
-  float dropout_rate = std::get<props::DropOutRate>(rnn_props);
-
-  Tensor &weight_xh = context.getWeight(wt_idx[RNNParams::weight_xh]);
-  Tensor &weight_hh = context.getWeight(wt_idx[RNNParams::weight_hh]);
-  Tensor &bias_h = context.getWeight(wt_idx[RNNParams::bias_h]);
-
-  Tensor &hidden_ = context.getTensor(wt_idx[RNNParams::hidden_state]);
-  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
-  const TensorDim &input_dim = input_.getDim();
-
-  // TODO: swap b and t index with transpose
-  for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-    Tensor islice = input_.getBatchSlice(b, 1);
-    Tensor oslice = hidden_.getBatchSlice(b, 1);
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(rnn_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(rnn_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(rnn_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(rnn_props).get();
+
+  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+  const TensorDim &input_dim = input.getDim();
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  const unsigned int feature_size = input_dim.width();
+  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
 
-    for (unsigned int t = 0; t < islice.height(); ++t) {
-      Tensor xs =
-        islice.getSharedDataTensor({islice.width()}, t * islice.width());
+  const Tensor &weight_ih = context.getWeight(wt_idx[RNNParams::weight_ih]);
+  const Tensor &weight_hh = context.getWeight(wt_idx[RNNParams::weight_hh]);
+  Tensor empty;
+  Tensor &bias_h = !disable_bias && integrate_bias
+                     ? context.getWeight(wt_idx[RNNParams::bias_h])
+                     : empty;
+  Tensor &bias_ih = !disable_bias && !integrate_bias
+                      ? context.getWeight(wt_idx[RNNParams::bias_ih])
+                      : empty;
+  Tensor &bias_hh = !disable_bias && !integrate_bias
+                      ? context.getWeight(wt_idx[RNNParams::bias_hh])
+                      : empty;
+
+  Tensor &hidden_state = context.getTensor(wt_idx[RNNParams::hidden_state]);
+
+  // TODO: swap batch and timestep index with transpose
+  for (unsigned int batch = 0; batch < batch_size; ++batch) {
+    Tensor input_slice = input.getBatchSlice(batch, 1);
+    Tensor hidden_state_slice = hidden_state.getBatchSlice(batch, 1);
+
+    for (unsigned int timestep = 0; timestep < max_timestep; ++timestep) {
+      Tensor in = input_slice.getSharedDataTensor({feature_size},
+                                                  timestep * feature_size);
       Tensor hs =
-        oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
-
-      xs.dot(weight_xh, hs);
-      hs.add_i(bias_h);
+        hidden_state_slice.getSharedDataTensor({unit}, timestep * unit);
+
+      in.dot(weight_ih, hs);
+      if (!disable_bias) {
+        if (integrate_bias) {
+          hs.add_i(bias_h);
+        } else {
+          hs.add_i(bias_ih);
+          hs.add_i(bias_hh);
+        }
+      }
 
-      if (t > 0) {
-        Tensor hs_prev = oslice.getSharedDataTensor({oslice.width()},
-                                                    (t - 1) * oslice.width());
-        hs_prev.dot(weight_hh, hs, false, false, 1.0);
+      if (timestep) {
+        Tensor prev_hs =
+          hidden_state_slice.getSharedDataTensor({unit}, (timestep - 1) * unit);
+        prev_hs.dot(weight_hh, hs, false, false, 1.0);
       }
 
       // In-place calculation for activation
       acti_func.run_fn(hs, hs);
 
       if (dropout_rate > epsilon && training) {
-        Tensor mask_ = context.getTensor(wt_idx[RNNParams::dropout_mask])
-                         .getBatchSlice(b, 1);
-        Tensor msk =
-          mask_.getSharedDataTensor({mask_.width()}, t * mask_.width());
-        msk.dropout_mask(dropout_rate);
-        hs.multiply_i(msk);
+        Tensor dropout_mask = context.getTensor(wt_idx[RNNParams::dropout_mask])
+                                .getBatchSlice(batch, 1);
+        Tensor dropout_mask_t =
+          dropout_mask.getSharedDataTensor({unit}, timestep * unit);
+        dropout_mask_t.dropout_mask(dropout_rate);
+        hs.multiply_i(dropout_mask_t);
       }
     }
   }
 
-  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
   if (!return_sequences) {
-    TensorDim d = hidden_.getDim();
-    for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-      float *data = hidden_.getAddress(b * d.width() * d.height() +
-                                       (d.height() - 1) * d.width());
-      float *rdata = output.getAddress(b * d.width());
-      std::copy(data, data + d.width(), rdata);
+    for (unsigned int batch = 0; batch < batch_size; ++batch) {
+      float *hidden_state_data = hidden_state.getAddress(
+        batch * unit * max_timestep + (max_timestep - 1) * unit);
+      float *output_data = output.getAddress(batch * unit);
+      std::copy(hidden_state_data, hidden_state_data + unit, output_data);
     }
   } else {
-    output.copy(hidden_);
+    output.copy(hidden_state);
   }
 }
 
 void RNNLayer::calcDerivative(RunLayerContext &context) {
-  Tensor &derivative_ = context.getTensorGrad(wt_idx[RNNParams::hidden_state]);
-  Tensor &weight = context.getWeight(wt_idx[RNNParams::weight_xh]);
-  Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
+  const Tensor &hidden_state_derivative =
+    context.getTensorGrad(wt_idx[RNNParams::hidden_state]);
+  const Tensor &weight = context.getWeight(wt_idx[RNNParams::weight_ih]);
+  Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
 
-  derivative_.dot(weight, ret_, false, true);
+  hidden_state_derivative.dot(weight, outgoing_derivative, false, true);
 }
 
 void RNNLayer::calcGradient(RunLayerContext &context) {
-  bool return_sequences = std::get<props::ReturnSequences>(rnn_props);
-  float dropout_rate = std::get<props::DropOutRate>(rnn_props);
-
-  Tensor &djdw_x = context.getWeightGrad(wt_idx[RNNParams::weight_xh]);
-  Tensor &djdw_h = context.getWeightGrad(wt_idx[RNNParams::weight_hh]);
-  Tensor &djdb_h = context.getWeightGrad(wt_idx[RNNParams::bias_h]);
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(rnn_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(rnn_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(rnn_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(rnn_props).get();
+
+  Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+  const TensorDim &input_dim = input.getDim();
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
+
+  Tensor &djdweight_ih = context.getWeightGrad(wt_idx[RNNParams::weight_ih]);
   Tensor &weight_hh = context.getWeight(wt_idx[RNNParams::weight_hh]);
-
-  Tensor &derivative_ = context.getTensorGrad(wt_idx[RNNParams::hidden_state]);
-  Tensor &incoming_deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
-  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
-  const TensorDim &input_dim = input_.getDim();
-
-  djdw_x.setZero();
-  djdw_h.setZero();
-  djdb_h.setZero();
-  derivative_.setZero();
+  Tensor &djdweight_hh = context.getWeightGrad(wt_idx[RNNParams::weight_hh]);
+  Tensor empty;
+  Tensor &djdbias_h = !disable_bias && integrate_bias
+                        ? context.getWeightGrad(wt_idx[RNNParams::bias_h])
+                        : empty;
+  Tensor &djdbias_ih = !disable_bias && !integrate_bias
+                         ? context.getWeightGrad(wt_idx[RNNParams::bias_ih])
+                         : empty;
+  Tensor &djdbias_hh = !disable_bias && !integrate_bias
+                         ? context.getWeightGrad(wt_idx[RNNParams::bias_hh])
+                         : empty;
+
+  Tensor &hidden_state_derivative =
+    context.getTensorGrad(wt_idx[RNNParams::hidden_state]);
+
+  djdweight_ih.setZero();
+  djdweight_hh.setZero();
+  if (!disable_bias) {
+    if (integrate_bias) {
+      djdbias_h.setZero();
+    } else {
+      djdbias_ih.setZero();
+      djdbias_hh.setZero();
+    }
+  }
+  hidden_state_derivative.setZero();
 
   if (!return_sequences) {
-    TensorDim d = derivative_.getDim();
-    for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-      float *data = derivative_.getAddress(b * d.width() * d.height() +
-                                           (d.height() - 1) * d.width());
-      float *rdata = incoming_deriv.getAddress(b * d.width());
-      std::copy(rdata, rdata + d.width(), data);
+    for (unsigned int batch = 0; batch < batch_size; ++batch) {
+      float *hidden_state_derivative_data = hidden_state_derivative.getAddress(
+        batch * unit * max_timestep + (max_timestep - 1) * unit);
+      float *incoming_derivative_data =
+        incoming_derivative.getAddress(batch * unit);
+      std::copy(incoming_derivative_data, incoming_derivative_data + unit,
+                hidden_state_derivative_data);
     }
   } else {
-    derivative_.copy(incoming_deriv);
+    hidden_state_derivative.copy(incoming_derivative);
   }
 
   if (dropout_rate > epsilon) {
-    derivative_.multiply_i(context.getTensor(wt_idx[RNNParams::dropout_mask]));
+    hidden_state_derivative.multiply_i(
+      context.getTensor(wt_idx[RNNParams::dropout_mask]));
   }
 
-  Tensor &hidden_ = context.getTensor(wt_idx[RNNParams::hidden_state]);
+  Tensor &hidden_state = context.getTensor(wt_idx[RNNParams::hidden_state]);
 
-  for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-    Tensor deriv_t = derivative_.getBatchSlice(b, 1);
-    Tensor xs_t = input_.getBatchSlice(b, 1);
-    Tensor hs_t = hidden_.getBatchSlice(b, 1);
+  for (unsigned int batch = 0; batch < batch_size; ++batch) {
+    Tensor deriv_t = hidden_state_derivative.getBatchSlice(batch, 1);
+    Tensor input_t = input.getBatchSlice(batch, 1);
+    Tensor hidden_state_t = hidden_state.getBatchSlice(batch, 1);
 
-    for (unsigned int t = deriv_t.height(); t-- > 0;) {
+    for (unsigned int timestep = max_timestep; timestep-- > 0;) {
       Tensor dh = deriv_t.getSharedDataTensor(
-        TensorDim(1, 1, 1, deriv_t.width()), t * deriv_t.width());
-      Tensor xs = xs_t.getSharedDataTensor(TensorDim(1, 1, 1, xs_t.width()),
-                                           t * xs_t.width());
-      Tensor hs = hs_t.getSharedDataTensor(TensorDim(1, 1, 1, hs_t.width()),
-                                           t * hs_t.width());
+        TensorDim(1, 1, 1, deriv_t.width()), timestep * deriv_t.width());
+      Tensor xs = input_t.getSharedDataTensor(
+        TensorDim(1, 1, 1, input_t.width()), timestep * input_t.width());
+      Tensor hs = hidden_state_t.getSharedDataTensor(
+        TensorDim(1, 1, 1, hidden_state_t.width()),
+        timestep * hidden_state_t.width());
 
       acti_func.run_prime_fn(hs, dh, dh);
-      djdb_h.add_i(dh);
-      xs.dot(dh, djdw_x, true, false, 1.0);
-
-      if (t > 0) {
-        Tensor hs_prev = hs_t.getSharedDataTensor(
-          TensorDim(1, 1, 1, hs_t.width()), (t - 1) * hs_t.width());
-        Tensor dh_t_1 = deriv_t.getSharedDataTensor(
-          TensorDim(1, 1, 1, deriv_t.width()), (t - 1) * deriv_t.width());
-        hs_prev.dot(dh, djdw_h, true, false, 1.0);
+      if (!disable_bias) {
+        if (integrate_bias) {
+          djdbias_h.add_i(dh);
+        } else {
+          djdbias_ih.add_i(dh);
+          djdbias_hh.add_i(dh);
+        }
+      }
+      xs.dot(dh, djdweight_ih, true, false, 1.0);
+
+      if (timestep) {
+        Tensor prev_hs = hidden_state_t.getSharedDataTensor(
+          TensorDim(1, 1, 1, hidden_state_t.width()),
+          (timestep - 1) * hidden_state_t.width());
+        Tensor dh_t_1 =
+          deriv_t.getSharedDataTensor(TensorDim(1, 1, 1, deriv_t.width()),
+                                      (timestep - 1) * deriv_t.width());
+        prev_hs.dot(dh, djdweight_hh, true, false, 1.0);
         dh.dot(weight_hh, dh_t_1, false, true, 1.0);
       }
     }
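
Note on equivalence: both bias layouts produce the same pre-activation, since
bias_h plays the role of bias_ih + bias_hh, and calcGradient above accumulates
the identical dh into djdbias_ih and djdbias_hh. A minimal standalone sketch of
one timestep (plain C++ over raw float buffers, not the nntrainer Tensor API;
all names here are illustrative):

    // One RNN timestep; shows that separate bias_ih/bias_hh behave exactly
    // like a single integrated bias_h = bias_ih + bias_hh.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    void rnn_step(const std::vector<float> &x,      /* [feature_size] */
                  const std::vector<float> &h_prev, /* [unit] */
                  const std::vector<float> &w_ih,   /* [feature_size * unit], row-major */
                  const std::vector<float> &w_hh,   /* [unit * unit], row-major */
                  const std::vector<float> &b_ih,   /* [unit] */
                  const std::vector<float> &b_hh,   /* [unit] */
                  std::vector<float> &h) {          /* [unit], written in place */
      const std::size_t unit = h.size();
      const std::size_t feature_size = x.size();
      for (std::size_t u = 0; u < unit; ++u) {
        float acc = b_ih[u] + b_hh[u]; // == integrated bias_h[u]
        for (std::size_t f = 0; f < feature_size; ++f)
          acc += x[f] * w_ih[f * unit + u];
        for (std::size_t v = 0; v < unit; ++v)
          acc += h_prev[v] * w_hh[v * unit + u];
        h[u] = std::tanh(acc); // default hidden state activation
      }
    }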
diff --git a/nntrainer/layers/rnn.h b/nntrainer/layers/rnn.h
index b1b148f..ed6e472 100644
@@ -103,12 +103,13 @@ private:
    * HiddenStateActivation: activation type for hidden state. default is tanh
   * ReturnSequences: whether to return the hidden state for every timestep
    * DropOutRate: dropout rate
+   * IntegrateBias: integrate bias_ih and bias_hh into a single bias_h
    *
    * */
   std::tuple<props::Unit, props::HiddenStateActivation, props::ReturnSequences,
-             props::DropOutRate>
+             props::DropOutRate, props::IntegrateBias>
     rnn_props;
-  std::array<unsigned int, 5> wt_idx; /**< indices of the weights */
+  std::array<unsigned int, 7> wt_idx; /**< indices of the weights */
 
   /**
    * @brief     activation function for h_t : default is tanh
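
Usage-wise, the new property goes through the ordinary property path (a sketch
only; rnn_layer stands for an already-created RNN layer handle):

    // integrate_bias=true  : single bias_h, matching the previous behavior
    // integrate_bias=false : separate bias_ih and bias_hh
    rnn_layer->setProperty({"unit=5", "integrate_bias=false"});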
diff --git a/test/unittest/layers/unittest_layers_rnn.cpp b/test/unittest/layers/unittest_layers_rnn.cpp
index e839892..5c279a8 100644
@@ -24,7 +24,7 @@ INSTANTIATE_TEST_CASE_P(RNN, LayerSemantics, ::testing::Values(semantic_rnn));
 
 auto rnn_single_step = LayerGoldenTestParamType(
   nntrainer::createLayer<nntrainer::RNNLayer>,
-  {"unit=5", "return_sequences=false"}, "3:1:1:7",
+  {"unit=5", "return_sequences=false", "integrate_bias=true"}, "3:1:1:7",
   "rnn_single_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
 
 INSTANTIATE_TEST_CASE_P(RNN, LayerGoldenTest,
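
The golden case above pins integrate_bias=true so it keeps matching the
existing single-bias golden data. A split-bias variant would look like the
following sketch (the golden file name is hypothetical and its data would
still need to be generated):

    auto rnn_single_step_2bias = LayerGoldenTestParamType(
      nntrainer::createLayer<nntrainer::RNNLayer>,
      {"unit=5", "return_sequences=false", "integrate_bias=false"}, "3:1:1:7",
      "rnn_single_step_2bias.nnlayergolden", // hypothetical golden data
      LayerGoldenTestParamOptions::DEFAULT);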
diff --git a/test/unittest/unittest_nntrainer_models.cpp b/test/unittest/unittest_nntrainer_models.cpp
index c4d07b9..3019869 100644
@@ -579,7 +579,7 @@ INI rnn_basic(
     sgd_base + "learning_rate = 0.1",
     I("input") + input_base + "input_shape=1:1:1",
     I("rnn") + rnn_base +
-      "unit = 2" + "input_layers=input",
+      "unit = 2" + "input_layers=input" + "integrate_bias=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn"
   }
 );
@@ -591,7 +591,7 @@ INI rnn_return_sequences(
     sgd_base + "learning_rate = 0.1",
     I("input") + input_base + "input_shape=1:2:1",
     I("rnn") + rnn_base +
-      "unit = 2" + "input_layers=input" + "return_sequences=true",
+      "unit = 2" + "input_layers=input" + "return_sequences=true" + "integrate_bias=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn"
   }
 );
@@ -631,7 +631,7 @@ INI rnn_return_sequence_with_batch(
     sgd_base + "learning_rate = 0.1",
     I("input") + input_base + "input_shape=1:2:1",
     I("rnn") + rnn_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 2" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn"
   }
 );
@@ -643,9 +643,9 @@ INI multi_rnn_return_sequence(
     sgd_base + "learning_rate = 0.1",
     I("input") + input_base + "input_shape=1:2:1",
     I("rnn") + rnn_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 2" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true",
     I("rnn2") + rnn_base +
-      "unit = 2" + "input_layers=rnn",
+      "unit = 2" + "input_layers=rnn" + "integrate_bias=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn2"
   }
 );
@@ -657,9 +657,9 @@ INI multi_rnn_return_sequence_with_batch(
     sgd_base + "learning_rate = 0.1",
     I("input") + input_base + "input_shape=1:2:1",
     I("rnn") + rnn_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 2" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true",
     I("rnn2") + rnn_base +
-      "unit = 2" + "input_layers=rnn",
+      "unit = 2" + "input_layers=rnn" + "integrate_bias=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn2"
   }
 );
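
For reference, the model-test cases above are INI-driven; the rnn section of
e.g. rnn_basic flattens to roughly the following (a sketch assuming the key
spelling used by the test helpers):

    [rnn]
    type = rnn
    unit = 2
    input_layers = input
    integrate_bias = true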