[gru] enable bias_hh, reset_after
author hyeonseok lee <hs89.lee@samsung.com>
Fri, 10 Dec 2021 08:22:41 +0000 (17:22 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Wed, 15 Dec 2021 02:40:58 +0000 (11:40 +0900)
 - Enable bias_hh in gru.
 - Enable reset_after in gru, grucell. If reset_after is set to true, the
   reset gate is applied after the matrix multiplication; otherwise it is
   applied before (see the sketch below).
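
 Below is a minimal NumPy sketch of the two variants for a single timestep,
 assuming the z, r, g gate order used by the layer; the names are illustrative
 only and do not correspond to nntrainer symbols.

    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    # x: input vector, h: previous hidden state
    # W / U: input-to-hidden / hidden-to-hidden weights, split per gate
    # b_i / b_h: input / recurrent biases, split per gate
    def gru_step(x, h, W, U, b_i, b_h, reset_after):
        W_z, W_r, W_g = np.split(W, 3, axis=1)
        U_z, U_r, U_g = np.split(U, 3, axis=1)
        bi_z, bi_r, bi_g = np.split(b_i, 3)
        bh_z, bh_r, bh_g = np.split(b_h, 3)
        z = sigmoid(x @ W_z + h @ U_z + bi_z + bh_z)
        r = sigmoid(x @ W_r + h @ U_r + bi_r + bh_r)
        if reset_after:
            # reset gate applied after the recurrent matrix multiplication
            g = np.tanh(x @ W_g + bi_g + r * (h @ U_g + bh_g))
        else:
            # reset gate applied before the recurrent matrix multiplication
            g = np.tanh(x @ W_g + bi_g + (r * h) @ U_g + bh_g)
        return z * h + (1.0 - z) * g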

close #1768

Self evaluation:

Build test: [X]Passed [ ]Failed [ ]Skipped
Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: hyeonseok lee <hs89.lee@samsung.com>
14 files changed:
nntrainer/layers/common_properties.h
nntrainer/layers/gru.cpp
nntrainer/layers/gru.h
nntrainer/layers/grucell.cpp
nntrainer/layers/grucell.h
packaging/unittest_layers_v2.tar.gz
packaging/unittest_models.tar.gz
test/input_gen/genLayerTests.py
test/input_gen/genModelTests.py
test/input_gen/transLayer.py
test/unittest/layers/unittest_layers_gru.cpp
test/unittest/layers/unittest_layers_grucell.cpp
test/unittest/models/unittest_models_recurrent.cpp
test/unittest/unittest_nntrainer_models.cpp

nntrainer/layers/common_properties.h
index 0f46a46..bcfced0 100644 (file)
@@ -585,6 +585,24 @@ public:
 };
 
 /**
+ * @brief ResetAfter property. If true, the reset gate is applied after the
+ * matrix multiplication; if false, before it. Used in gru, grucell.
+ *
+ */
+class ResetAfter : public nntrainer::Property<bool> {
+
+public:
+  /**
+   * @brief Construct a new ResetAfter object with a default value true
+   *
+   */
+  ResetAfter(bool value = true) : nntrainer::Property<bool>(value) {}
+  static constexpr const char *key = "reset_after"; /**< unique key to access */
+  using prop_tag = bool_prop_tag;                   /**< property type */
+};
+
+/**
  * @brief Number of class
  * @todo deprecate this
  */
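
For reference, the new ResetAfter key mirrors the reset_after argument of
tf.keras.layers.GRU used by the test generators below; in recent TF 2.x
releases Keras also defaults to reset_after=True, which matches the property's
default. A short illustrative snippet (assumes TensorFlow 2.x):

    import tensorflow as tf

    # reset_after=False: reset gate applied before the recurrent matmul (classic GRU)
    gru_before = tf.keras.layers.GRU(units=5, reset_after=False)
    # reset_after=True: reset gate applied after the recurrent matmul; this is
    # also the tf.keras default and matches the ResetAfter default above
    gru_after = tf.keras.layers.GRU(units=5, reset_after=True)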
nntrainer/layers/gru.cpp
index c1c674b..46d11fe 100644 (file)
@@ -39,9 +39,11 @@ namespace nntrainer {
 static constexpr size_t SINGLE_INOUT_IDX = 0;
 
 enum GRUParams {
-  weight_xh,
+  weight_ih,
   weight_hh,
   bias_h,
+  bias_ih,
+  bias_hh,
   hidden_state,
   zrg,
   h_prev,
@@ -50,118 +52,124 @@ enum GRUParams {
 
 GRULayer::GRULayer() :
   LayerImpl(),
-  gru_props(props::Unit(), props::HiddenStateActivation(),
-            props::RecurrentActivation(), props::ReturnSequences(),
-            props::DropOutRate()),
+  gru_props(props::Unit(),
+            props::HiddenStateActivation() = ActivationType::ACT_TANH,
+            props::RecurrentActivation() = ActivationType::ACT_SIGMOID,
+            props::ReturnSequences(), props::DropOutRate(),
+            props::IntegrateBias(), props::ResetAfter()),
   acti_func(ActivationType::ACT_NONE, true),
   recurrent_acti_func(ActivationType::ACT_NONE, true),
   epsilon(1e-3) {
   wt_idx.fill(std::numeric_limits<unsigned>::max());
 }
 
-// - weight_xh ( input to hidden )
-//  : [1, 1, input_size, unit (hidden_size) x NUM_GATE] -> z, r, g
-// - weight_hh ( hidden to hidden )
-//  : [1, 1, unit (hidden_size) , unit (hidden_size) x NUM_GATE] -> z, r, g
-// - bias_h ( hidden bias )
-//  : [1, 1, 1, unit (hidden_size) x NUM_GATE] -> z, r, g
 void GRULayer::finalize(InitLayerContext &context) {
-  auto &weight_regularizer =
-    std::get<props::WeightRegularizer>(*layer_impl_props);
-  auto &weight_regularizer_constant =
-    std::get<props::WeightRegularizerConstant>(*layer_impl_props);
-  auto &weight_initializer =
-    std::get<props::WeightInitializer>(*layer_impl_props);
-  auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
-
-  auto unit = std::get<props::Unit>(gru_props).get();
-  auto &hidden_state_activation_type =
-    std::get<props::HiddenStateActivation>(gru_props);
-  auto &recurrent_activation_type =
-    std::get<props::RecurrentActivation>(gru_props);
-  bool return_sequences = std::get<props::ReturnSequences>(gru_props);
-  float dropout_rate = std::get<props::DropOutRate>(gru_props);
+  const Tensor::Initializer weight_initializer =
+    std::get<props::WeightInitializer>(*layer_impl_props).get();
+  const Tensor::Initializer bias_initializer =
+    std::get<props::BiasInitializer>(*layer_impl_props).get();
+  const WeightRegularizer weight_regularizer =
+    std::get<props::WeightRegularizer>(*layer_impl_props).get();
+  const float weight_regularizer_constant =
+    std::get<props::WeightRegularizerConstant>(*layer_impl_props).get();
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(gru_props).get();
+  ActivationType hidden_state_activation_type =
+    std::get<props::HiddenStateActivation>(gru_props).get();
+  ActivationType recurrent_activation_type =
+    std::get<props::RecurrentActivation>(gru_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(gru_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
 
   if (context.getNumInputs() != 1) {
     throw std::invalid_argument("GRU layer takes only one input");
   }
 
-  TensorDim output_dim;
+  // input_dim = [ batch, 1, time_iteration, feature_size ]
   const TensorDim &input_dim = context.getInputDimensions()[0];
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  const unsigned int feature_size = input_dim.width();
 
-  // input_dim = [ batch, 1, time_iteration, feature_size ]
   // if return_sequences == False :
-  //      output_dim = [ batch, 1, 1, hidden_size (unit)]
+  //      output_dim = [ batch, 1, 1, unit ]
   // else:
-  //      output_dim = [ batch, 1, time_iteration, hidden_size ( unit ) ]
-  output_dim = input_dim;
-  output_dim.width(unit);
-
-  if (dropout_rate > epsilon) {
-    wt_idx[GRUParams::dropout_mask] = context.requestTensor(
-      output_dim, "dropout_mask", Tensor::Initializer::NONE, false,
-      TensorLifespan::ITERATION_LIFESPAN);
-  }
-
-  if (!return_sequences) {
-    output_dim.height(1);
-  }
-
+  //      output_dim = [ batch, 1, time_iteration, unit ]
+  TensorDim output_dim(
+    {batch_size, 1, return_sequences ? max_timestep : 1, unit});
   context.setOutputDimensions({output_dim});
 
-  TensorDim bias_dim = TensorDim();
-  bias_dim.setTensorDim(3, unit * NUM_GATE);
-
-  TensorDim dim_xh = output_dim;
-  dim_xh.height(input_dim.width());
-  dim_xh.width(unit * NUM_GATE);
-  dim_xh.batch(1);
-
-  TensorDim dim_hh = output_dim;
-  dim_hh.height(unit);
-  dim_hh.width(unit * NUM_GATE);
-  dim_hh.batch(1);
-
-  // weight_initializer can be set seperately. weight_xh initializer,
+  // weight_initializer can be set separately. weight_ih initializer,
   // weight_hh initializer kernel initializer & recurrent_initializer in keras
   // for now, it is set same way.
-  wt_idx[GRUParams::weight_xh] =
-    context.requestWeight(dim_xh, weight_initializer, weight_regularizer,
-                          weight_regularizer_constant, "weight_xh", true);
+
+  // - weight_ih ( input to hidden )
+  // weight_ih_dim : [ 1, 1, feature_size, NUM_GATE * unit ] -> z, r, g
+  TensorDim weight_ih_dim({feature_size, NUM_GATE * unit});
+  wt_idx[GRUParams::weight_ih] =
+    context.requestWeight(weight_ih_dim, weight_initializer, weight_regularizer,
+                          weight_regularizer_constant, "weight_ih", true);
+  // - weight_hh ( hidden to hidden )
+  // weight_hh_dim : [ 1, 1, unit, NUM_GATE * unit ] -> z, r, g
+  TensorDim weight_hh_dim({unit, NUM_GATE * unit});
   wt_idx[GRUParams::weight_hh] =
-    context.requestWeight(dim_hh, weight_initializer, weight_regularizer,
+    context.requestWeight(weight_hh_dim, weight_initializer, weight_regularizer,
                           weight_regularizer_constant, "weight_hh", true);
-  wt_idx[GRUParams::bias_h] = context.requestWeight(
-    bias_dim, bias_initializer, WeightRegularizer::NONE, 1.0f, "bias_h", true);
-
-  TensorDim d = input_dim;
-  d.width(unit);
+  if (!disable_bias) {
+    if (integrate_bias) {
+      // - bias_h ( input bias and hidden bias integrated into one bias )
+      // bias_h_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+      TensorDim bias_h_dim({NUM_GATE * unit});
+      wt_idx[GRUParams::bias_h] =
+        context.requestWeight(bias_h_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_h", true);
+    } else {
+      // - bias_ih ( input bias )
+      // bias_ih_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+      TensorDim bias_ih_dim({NUM_GATE * unit});
+      wt_idx[GRUParams::bias_ih] =
+        context.requestWeight(bias_ih_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_ih", true);
+      // - bias_hh ( hidden bias )
+      // bias_hh_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+      TensorDim bias_hh_dim({NUM_GATE * unit});
+      wt_idx[GRUParams::bias_hh] =
+        context.requestWeight(bias_hh_dim, bias_initializer,
+                              WeightRegularizer::NONE, 1.0f, "bias_hh", true);
+    }
+  }
 
-  wt_idx[GRUParams::hidden_state] =
-    context.requestTensor(d, "hidden_state", Tensor::Initializer::NONE, true,
-                          TensorLifespan::ITERATION_LIFESPAN);
+  // hidden_state_dim = [ batch, 1, max_timestep, unit ]
+  TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit);
+  wt_idx[GRUParams::hidden_state] = context.requestTensor(
+    hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true,
+    TensorLifespan::ITERATION_LIFESPAN);
 
-  d.width(unit * NUM_GATE);
+  // zrg_dim = [ batch, 1, max_timestep, NUM_GATE * unit ]
+  TensorDim zrg_dim(batch_size, 1, max_timestep, NUM_GATE * unit);
   wt_idx[GRUParams::zrg] =
-    context.requestTensor(d, "zrg", Tensor::Initializer::NONE, true,
+    context.requestTensor(zrg_dim, "zrg", Tensor::Initializer::NONE, true,
                           TensorLifespan::ITERATION_LIFESPAN);
 
-  TensorDim h_dim = TensorDim();
-  h_dim.setTensorDim(3, unit);
-  h_dim.batch(input_dim.batch());
+  // h_prev_dim = [ batch, 1, 1, unit ]
+  TensorDim h_prev_dim = TensorDim({batch_size, 1, 1, unit});
   wt_idx[GRUParams::h_prev] =
-    context.requestTensor(h_dim, "h_prev", Tensor::Initializer::NONE, false,
-                          TensorLifespan::FORWARD_FUNC_LIFESPAN);
+    context.requestTensor(h_prev_dim, "h_prev", Tensor::Initializer::NONE,
+                          false, TensorLifespan::FORWARD_FUNC_LIFESPAN);
 
-  if (hidden_state_activation_type.get() == ActivationType::ACT_NONE) {
-    hidden_state_activation_type.set(ActivationType::ACT_TANH);
+  if (dropout_rate > epsilon) {
+    TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit);
+    wt_idx[GRUParams::dropout_mask] = context.requestTensor(
+      output_dim, "dropout_mask", Tensor::Initializer::NONE, false,
+      TensorLifespan::ITERATION_LIFESPAN);
   }
-  acti_func.setActiFunc(hidden_state_activation_type.get());
 
-  if (recurrent_activation_type.get() == ActivationType::ACT_NONE) {
-    recurrent_activation_type.set(ActivationType::ACT_SIGMOID);
-  }
-  recurrent_acti_func.setActiFunc(recurrent_activation_type.get());
+  acti_func.setActiFunc(hidden_state_activation_type);
+  recurrent_acti_func.setActiFunc(recurrent_activation_type);
 }
 
 void GRULayer::setProperty(const std::vector<std::string> &values) {
@@ -175,25 +183,45 @@ void GRULayer::exportTo(Exporter &exporter, const ExportMethods &method) const {
 }
 
 void GRULayer::forwarding(RunLayerContext &context, bool training) {
-  auto unit = std::get<props::Unit>(gru_props).get();
-  bool return_sequences = std::get<props::ReturnSequences>(gru_props);
-  float dropout_rate = std::get<props::DropOutRate>(gru_props);
-
-  Tensor &weight_xh = context.getWeight(wt_idx[GRUParams::weight_xh]);
-  Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
-  Tensor &bias_h = context.getWeight(wt_idx[GRUParams::bias_h]);
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(gru_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(gru_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
+  const bool reset_after = std::get<props::ResetAfter>(gru_props).get();
+
+  Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+  const TensorDim &input_dim = input.getDim();
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  const unsigned int feature_size = input_dim.width();
+  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
 
-  Tensor &hidden_ = context.getTensor(wt_idx[GRUParams::hidden_state]);
-  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+  const Tensor &weight_ih = context.getWeight(wt_idx[GRUParams::weight_ih]);
+  const Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
+  Tensor empty;
+  Tensor &bias_h = !disable_bias && integrate_bias
+                     ? context.getWeight(wt_idx[GRUParams::bias_h])
+                     : empty;
+  Tensor &bias_ih = !disable_bias && !integrate_bias
+                      ? context.getWeight(wt_idx[GRUParams::bias_ih])
+                      : empty;
+  Tensor &bias_hh = !disable_bias && !integrate_bias
+                      ? context.getWeight(wt_idx[GRUParams::bias_hh])
+                      : empty;
+
+  Tensor &hidden_state = context.getTensor(wt_idx[GRUParams::hidden_state]);
   Tensor &zrg = context.getTensor(wt_idx[GRUParams::zrg]);
   Tensor &h_prev = context.getTensor(wt_idx[GRUParams::h_prev]);
-  const TensorDim &input_dim = input_.getDim();
 
-  hidden_.setZero();
+  hidden_state.setZero();
   zrg.setZero();
   h_prev.setZero();
 
-  Tensor hs_prev;
+  Tensor prev_hs;
   Tensor hs;
 
   // zt = sigma(W_hz.h_prev + W_xz.xs)
@@ -201,34 +229,31 @@ void GRULayer::forwarding(RunLayerContext &context, bool training) {
   // gt = tanh((h_prev*rt).W_hr + W_xg.xs)
   // h_nx = (1-zt)*gt + zt*h_prev
 
-  for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-    Tensor islice = input_.getBatchSlice(b, 1);
-    Tensor oslice = hidden_.getBatchSlice(b, 1);
+  for (unsigned int b = 0; b < batch_size; ++b) {
+    Tensor islice = input.getBatchSlice(b, 1);
+    Tensor oslice = hidden_state.getBatchSlice(b, 1);
     Tensor zrg_ = zrg.getBatchSlice(b, 1);
 
-    for (unsigned int t = 0; t < islice.height(); ++t) {
-      Tensor xs =
-        islice.getSharedDataTensor({islice.width()}, t * islice.width());
+    for (unsigned int t = 0; t < max_timestep; ++t) {
+      Tensor xs = islice.getSharedDataTensor({feature_size}, t * feature_size);
 
       /** @todo verify this dropout working */
       // if (dropout_rate > 0.0 && training) {
       //   xs.multiply_i(xs.dropout_mask(dropout_rate));
       // }
-      hs = oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
+      hs = oslice.getSharedDataTensor({unit}, t * unit);
       Tensor zrg_t =
         zrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
 
       if (t > 0) {
-        hs_prev = oslice.getSharedDataTensor({oslice.width()},
-                                             (t - 1) * oslice.width());
+        prev_hs = oslice.getSharedDataTensor({unit}, (t - 1) * unit);
       } else {
-        hs_prev = h_prev.getBatchSlice(b, 1);
+        prev_hs = h_prev.getBatchSlice(b, 1);
       }
 
-      xs.dot(weight_xh, zrg_t); // x_z, x_r, x_g
+      xs.dot(weight_ih, zrg_t); // x_z, x_r, x_g
 
       Tensor ztrt = zrg_t.getSharedDataTensor({unit * 2}, 0);
-      Tensor ztrt_b = bias_h.getSharedDataTensor({unit * 2}, 0);
 
       Tensor w_hh;
       w_hh.copy_with_stride(
@@ -238,123 +263,181 @@ void GRULayer::forwarding(RunLayerContext &context, bool training) {
         weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false));
 
       Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2);
-      Tensor gt_b = bias_h.getSharedDataTensor({unit}, unit * 2);
 
-      ztrt.add_i(hs_prev.dot(w_hh));
-      ztrt.add_i(ztrt_b);
+      ztrt.add_i(prev_hs.dot(w_hh));
+      if (!disable_bias) {
+        if (integrate_bias) {
+          Tensor ztrt_bias_h = bias_h.getSharedDataTensor({unit * 2}, 0);
+          ztrt.add_i(ztrt_bias_h);
+        } else {
+          Tensor ztrt_bias_ih = bias_ih.getSharedDataTensor({unit * 2}, 0);
+          ztrt.add_i(ztrt_bias_ih);
+          Tensor ztrt_bias_hh = bias_hh.getSharedDataTensor({unit * 2}, 0);
+          ztrt.add_i(ztrt_bias_hh);
+        }
+      }
+
+      recurrent_acti_func.run_fn(ztrt, ztrt);
 
       Tensor zt = ztrt.getSharedDataTensor({unit}, 0);
       Tensor rt = ztrt.getSharedDataTensor({unit}, unit);
 
-      recurrent_acti_func.run_fn(rt, rt);
-      recurrent_acti_func.run_fn(zt, zt);
-
       Tensor temp;
-      rt.multiply(hs_prev, temp);
-      gt.add_i(temp.dot(w_g));
-      gt.add_i(gt_b);
+      if (reset_after) {
+        prev_hs.dot(w_g, temp);
+        if (!disable_bias && !integrate_bias) {
+          Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+          temp.add_i(bias_hh_g);
+        }
+        temp.multiply_i(rt);
+        gt.add_i(temp);
+      } else {
+        rt.multiply(prev_hs, temp);
+        temp.dot(w_g, gt, false, false, 1.0f);
+        if (!disable_bias && !integrate_bias) {
+          Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+          gt.add_i(bias_hh_g);
+        }
+      }
+      if (!disable_bias) {
+        if (integrate_bias) {
+          Tensor gt_bias_h = bias_h.getSharedDataTensor({unit}, unit * 2);
+          gt.add_i(gt_bias_h);
+        } else {
+          Tensor gt_bias_ih = bias_ih.getSharedDataTensor({unit}, unit * 2);
+          gt.add_i(gt_bias_ih);
+        }
+      }
+
       acti_func.run_fn(gt, gt);
 
-      zt.multiply(hs_prev, hs);
+      zt.multiply(prev_hs, hs);
       temp = zt.multiply(-1.0).add(1.0);
       hs.add_i(gt.multiply(temp));
 
       if (dropout_rate > epsilon && training) {
         Tensor mask_ = context.getTensor(wt_idx[GRUParams::dropout_mask])
                          .getBatchSlice(b, 1);
-        Tensor msk =
-          mask_.getSharedDataTensor({mask_.width()}, t * mask_.width());
+        Tensor msk = mask_.getSharedDataTensor({unit}, t * unit);
         msk.dropout_mask(dropout_rate);
         hs.multiply_i(msk);
       }
     }
   }
 
-  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
   if (!return_sequences) {
-    TensorDim d = hidden_.getDim();
-    for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-      Tensor dest = output.getSharedDataTensor({d.width()}, b * d.width());
-      Tensor src = hidden_.getSharedDataTensor(
-        {d.width()}, b * d.width() * d.height() + (d.height() - 1) * d.width());
+    for (unsigned int batch = 0; batch < batch_size; ++batch) {
+      Tensor dest = output.getSharedDataTensor({unit}, batch * unit);
+      Tensor src = hidden_state.getSharedDataTensor(
+        {unit}, batch * unit * max_timestep + (max_timestep - 1) * unit);
       dest.copy(src);
     }
   } else {
-    output.copy(hidden_);
+    output.copy(hidden_state);
   }
 }
 
 void GRULayer::calcDerivative(RunLayerContext &context) {
-  Tensor &derivative_ = context.getTensorGrad(wt_idx[GRUParams::zrg]);
-  Tensor &weight = context.getWeight(wt_idx[GRUParams::weight_xh]);
-  Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
+  Tensor &zrg_derivative = context.getTensorGrad(wt_idx[GRUParams::zrg]);
+  Tensor &weight_ih = context.getWeight(wt_idx[GRUParams::weight_ih]);
+  Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
 
-  derivative_.dot(weight, ret_, false, true);
+  zrg_derivative.dot(weight_ih, outgoing_derivative, false, true);
 }
 
 void GRULayer::calcGradient(RunLayerContext &context) {
-  auto unit = std::get<props::Unit>(gru_props).get();
-  bool return_sequences = std::get<props::ReturnSequences>(gru_props);
-  float dropout_rate = std::get<props::DropOutRate>(gru_props);
-
-  Tensor &djdw_x = context.getWeightGrad(wt_idx[GRUParams::weight_xh]);
-  Tensor &djdw_h = context.getWeightGrad(wt_idx[GRUParams::weight_hh]);
-  Tensor &djdb_h = context.getWeightGrad(wt_idx[GRUParams::bias_h]);
+  const bool disable_bias =
+    std::get<props::DisableBias>(*layer_impl_props).get();
+
+  const unsigned int unit = std::get<props::Unit>(gru_props).get();
+  const bool return_sequences =
+    std::get<props::ReturnSequences>(gru_props).get();
+  const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+  const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
+  const bool reset_after = std::get<props::ResetAfter>(gru_props).get();
+
+  Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+  const TensorDim &input_dim = input.getDim();
+  const unsigned int batch_size = input_dim.batch();
+  const unsigned int max_timestep = input_dim.height();
+  const unsigned int feature_size = input_dim.width();
+  Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
+
+  Tensor &djdweight_ih = context.getWeightGrad(wt_idx[GRUParams::weight_ih]);
   Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
-
-  Tensor djdw_zr_h = Tensor({1, 1, unit, unit * 2}, true);
-  Tensor djdw_g_h = Tensor({1, 1, unit, unit}, true);
-  Tensor &derivative_ = context.getTensorGrad(wt_idx[GRUParams::hidden_state]);
-  Tensor &hidden_ = context.getTensor(wt_idx[GRUParams::hidden_state]);
-  Tensor &incoming_deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
-  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+  Tensor &djdweight_hh = context.getWeightGrad(wt_idx[GRUParams::weight_hh]);
+  Tensor empty;
+  Tensor &djdbias_h = !disable_bias && integrate_bias
+                        ? context.getWeightGrad(wt_idx[GRUParams::bias_h])
+                        : empty;
+  Tensor &djdbias_ih = !disable_bias && !integrate_bias
+                         ? context.getWeightGrad(wt_idx[GRUParams::bias_ih])
+                         : empty;
+  Tensor &bias_hh = !disable_bias && !integrate_bias
+                      ? context.getWeight(wt_idx[GRUParams::bias_hh])
+                      : empty;
+  Tensor &djdbias_hh = !disable_bias && !integrate_bias
+                         ? context.getWeightGrad(wt_idx[GRUParams::bias_hh])
+                         : empty;
+
+  Tensor djdweight_hh_zr = Tensor({1, 1, unit, unit * 2}, true);
+  Tensor djdweight_hh_g = Tensor({1, 1, unit, unit}, true);
+  Tensor &hidden_state_derivative =
+    context.getTensorGrad(wt_idx[GRUParams::hidden_state]);
+  Tensor &hidden_state = context.getTensor(wt_idx[GRUParams::hidden_state]);
   Tensor &zrg = context.getTensor(wt_idx[GRUParams::zrg]);
   Tensor &d_zrg = context.getTensorGrad(wt_idx[GRUParams::zrg]);
-  const TensorDim &input_dim = input_.getDim();
 
-  djdw_x.setZero();
-  djdw_zr_h.setZero();
-  djdw_g_h.setZero();
-  djdb_h.setZero();
+  djdweight_ih.setZero();
+  djdweight_hh_zr.setZero();
+  djdweight_hh_g.setZero();
+  if (!disable_bias) {
+    if (integrate_bias) {
+      djdbias_h.setZero();
+    } else {
+      djdbias_ih.setZero();
+      djdbias_hh.setZero();
+    }
+  }
 
-  derivative_.setZero();
+  hidden_state_derivative.setZero();
   d_zrg.setZero();
 
   if (!return_sequences) {
-    TensorDim d = derivative_.getDim();
-    for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-      Tensor dest = derivative_.getSharedDataTensor(
-        {d.width()}, b * d.width() * d.height() + (d.height() - 1) * d.width());
+    for (unsigned int batch = 0; batch < batch_size; ++batch) {
+      Tensor dest = hidden_state_derivative.getSharedDataTensor(
+        {unit}, batch * unit * max_timestep + (max_timestep - 1) * unit);
       Tensor src =
-        incoming_deriv.getSharedDataTensor({d.width()}, b * d.width());
+        incoming_derivative.getSharedDataTensor({unit}, batch * unit);
       dest.copy(src);
     }
   } else {
-    derivative_.copy(incoming_deriv);
+    hidden_state_derivative.copy(incoming_derivative);
   }
 
   if (dropout_rate > epsilon) {
-    derivative_.multiply_i(context.getTensor(wt_idx[GRUParams::dropout_mask]));
+    hidden_state_derivative.multiply_i(
+      context.getTensor(wt_idx[GRUParams::dropout_mask]));
   }
 
-  Tensor dh_nx = Tensor({derivative_.width()});
+  Tensor dh_nx = Tensor({unit});
 
-  for (unsigned int b = 0; b < input_dim.batch(); ++b) {
-    Tensor deriv_t = derivative_.getBatchSlice(b, 1);
-    Tensor xs_t = input_.getBatchSlice(b, 1);
-    Tensor hs_t = hidden_.getBatchSlice(b, 1);
+  for (unsigned int b = 0; b < batch_size; ++b) {
+    Tensor deriv_t = hidden_state_derivative.getBatchSlice(b, 1);
+    Tensor xs_t = input.getBatchSlice(b, 1);
+    Tensor hs_t = hidden_state.getBatchSlice(b, 1);
 
     dh_nx.setZero();
 
     Tensor dh;
-    Tensor hs_prev;
+    Tensor prev_hs;
     Tensor xs;
     Tensor dzrg_ = d_zrg.getBatchSlice(b, 1);
     Tensor zrg_ = zrg.getBatchSlice(b, 1);
 
-    for (unsigned int t = deriv_t.height(); t-- > 0;) {
-      dh = deriv_t.getSharedDataTensor({deriv_t.width()}, t * deriv_t.width());
-      xs = xs_t.getSharedDataTensor({xs_t.width()}, t * xs_t.width());
+    for (unsigned int t = max_timestep; t-- > 0;) {
+      dh = deriv_t.getSharedDataTensor({unit}, t * unit);
+      xs = xs_t.getSharedDataTensor({feature_size}, t * feature_size);
 
       Tensor dzrg_t =
         dzrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
@@ -362,13 +445,12 @@ void GRULayer::calcGradient(RunLayerContext &context) {
         zrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
 
       if (t == 0) {
-        hs_prev = Tensor({hs_t.width()});
-        hs_prev.setZero();
+        prev_hs = Tensor({unit});
+        prev_hs.setZero();
       } else {
-        hs_prev =
-          hs_t.getSharedDataTensor({hs_t.width()}, (t - 1) * hs_t.width());
+        prev_hs = hs_t.getSharedDataTensor({unit}, (t - 1) * unit);
       }
-      if (t < deriv_t.height() - 1) {
+      if (t < max_timestep - 1) {
         dh.add_i(dh_nx);
       }
 
@@ -380,9 +462,8 @@ void GRULayer::calcGradient(RunLayerContext &context) {
       Tensor rt = zrg_t.getSharedDataTensor({unit}, unit);
       Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2);
 
-      zt.multiply(dh, dh_nx); // dh_nx = d1
-
-      dh.multiply(hs_prev, dhz);       // dhz = d2
+      zt.multiply(dh, dh_nx);          // dh_nx = d1
+      dh.multiply(prev_hs, dhz);       // dhz = d2
       dhz.subtract_i(gt.multiply(dh)); // dhz = d5
       zt.multiply(-1.0, dhg);
       dhg.add_i(1.0);
@@ -400,34 +481,69 @@ void GRULayer::calcGradient(RunLayerContext &context) {
       wzr_hh.copy_with_stride(
         weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false));
 
-      Tensor temp = Tensor({hs_t.width()});
-      temp.setZero();
-      dhg.dot(wg_hh, temp, false, true); // temp = d10
-      hs_prev.multiply(temp, dhr);       // dhr = d15
-      temp.multiply_i(rt);               // temp=d14
-      dh_nx.add_i(temp);                 //  dh_nx = d1 + d14
-      // reset temp : hs_prev * rt for djdw_g_h
-      hs_prev.multiply(rt, temp);
-      recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
+      Tensor temp = Tensor({unit});
+
+      if (reset_after) {
+        prev_hs.dot(wg_hh, temp);
+        if (!disable_bias && !integrate_bias) {
+          const Tensor bias_hh_g =
+            bias_hh.getSharedDataTensor({unit}, 2 * unit);
+          temp.add_i(bias_hh_g);
+        }
+        dhg.multiply(temp, dhr);
+
+        // reset temp: dhg * rt for djdbias_hh_g, dh_nx and djdweight_hh_g
+        dhg.multiply(rt, temp);
+        if (!disable_bias && !integrate_bias) {
+          Tensor djdbias_hh_g =
+            djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+          djdbias_hh_g.add_i(temp);
+        }
+        temp.dot(wg_hh, dh_nx, false, true, 1.0f); // dh_nx = d1 + d14
+        djdweight_hh_g.add_i(prev_hs.dot(temp, true, false));
+      } else {
+        if (!disable_bias && !integrate_bias) {
+          Tensor djdbias_hh_g =
+            djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+          djdbias_hh_g.add_i(dhg);
+        }
+
+        dhg.dot(wg_hh, temp, false, true); // temp = d10
+        temp.multiply(prev_hs, dhr);       // dhr = d15
+        temp.multiply_i(rt);               // temp=d14
+        dh_nx.add_i(temp);                 //  dh_nx = d1 + d14
+
+        // reset temp : prev_hs * rt for djdweight_hh_g
+        rt.multiply(prev_hs, temp);
+        temp.dot(dhg, djdweight_hh_g, true, false, 1.0f);
+      }
 
-      djdb_h.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+      recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
 
-      djdw_x.add_i(xs.dot(dzrg_t, true, false));
+      if (!disable_bias) {
+        if (integrate_bias) {
+          djdbias_h.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+        } else {
+          djdbias_ih.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+          Tensor djdbias_hh_zr = djdbias_hh.getSharedDataTensor({2 * unit}, 0);
+          djdbias_hh_zr.add_i(dzrg_t.getSharedDataTensor({2 * unit}, 0));
+        }
+      }
 
-      djdw_zr_h.add_i(hs_prev.dot(dhzr, true, false));
-      djdw_g_h.add_i(temp.dot(dhg, true, false));
+      djdweight_hh_zr.add_i(prev_hs.dot(dhzr, true, false));
+      xs.dot(dzrg_t, djdweight_ih, true, false, 1.0f);
       dhzr.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17
     }
   }
   for (unsigned int h = 0; h < unit; ++h) {
-    float *data = djdw_zr_h.getAddress(h * unit * 2);
-    float *rdata = djdw_h.getAddress(h * unit * NUM_GATE);
+    float *data = djdweight_hh_zr.getAddress(h * unit * 2);
+    float *rdata = djdweight_hh.getAddress(h * unit * NUM_GATE);
     std::copy(data, data + unit * 2, rdata);
   }
 
   for (unsigned int h = 0; h < unit; ++h) {
-    float *data = djdw_g_h.getAddress(h * unit);
-    float *rdata = djdw_h.getAddress(h * unit * NUM_GATE + unit * 2);
+    float *data = djdweight_hh_g.getAddress(h * unit);
+    float *rdata = djdweight_hh.getAddress(h * unit * NUM_GATE + unit * 2);
     std::copy(data, data + unit, rdata);
   }
 }
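
For orientation, the reset_after branch of calcGradient follows the chain rule
through g = tanh(x.W_g + b_ig + r * (h.U_g + b_hg)). A rough NumPy sketch of
that candidate-gate backward step, with illustrative names only (not the
nntrainer API):

    import numpy as np

    # dg: incoming gradient w.r.t. the pre-activation of the g gate ("dhg" above)
    def candidate_backward_reset_after(x, h, r, U_g, b_hg, dg):
        a = h @ U_g + b_hg          # recurrent term computed before the reset gate
        dr = dg * a                 # gradient reaching the reset gate (dhr, "d15")
        drg = dg * r                # dg scaled by the reset gate, reused below
        db_hg = drg                 # recurrent bias gradient for the g gate
        dU_g = np.outer(h, drg)     # recurrent weight gradient for the g gate
        dh = drg @ U_g.T            # contribution added into dh_nx ("d14")
        dW_ig = np.outer(x, dg)     # input-to-hidden g-gate weight gradient
        db_ig = dg                  # input bias gradient for the g gate
        return dr, dh, dU_g, db_hg, dW_ig, db_ig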
nntrainer/layers/gru.h
index c66c7ae..c3372c4 100644 (file)
@@ -106,13 +106,15 @@ private:
    * RecurrentActivation: activation type for recurrent. default is sigmoid
    * ReturnSequence: option for return sequence
    * DropOutRate: dropout rate
+   * IntegrateBias: integrate bias_ih and bias_hh into bias_h
+   * ResetAfter: whether to apply the reset gate after (true) or before
+   * (false) the matrix multiplication
    *
    * */
   std::tuple<props::Unit, props::HiddenStateActivation,
              props::RecurrentActivation, props::ReturnSequences,
-             props::DropOutRate>
+             props::DropOutRate, props::IntegrateBias, props::ResetAfter>
     gru_props;
-  std::array<unsigned int, 7> wt_idx; /**< indices of the weights */
+  std::array<unsigned int, 9> wt_idx; /**< indices of the weights */
 
   /**
    * @brief     activation function for h_t : default is sigmoid
nntrainer/layers/grucell.cpp
index 1c1d5e0..94896d3 100644 (file)
@@ -60,7 +60,7 @@ GRUCellLayer::GRUCellLayer() :
                 props::HiddenStateActivation() = ActivationType::ACT_TANH,
                 props::RecurrentActivation() = ActivationType::ACT_SIGMOID,
                 props::DropOutRate(), props::IntegrateBias(),
-                props::MaxTimestep(), props::Timestep()),
+                props::ResetAfter(), props::MaxTimestep(), props::Timestep()),
   acti_func(ActivationType::ACT_NONE, true),
   recurrent_acti_func(ActivationType::ACT_NONE, true),
   epsilon(1e-3) {
@@ -191,6 +191,7 @@ void GRUCellLayer::forwarding(RunLayerContext &context, bool training) {
   const float dropout_rate = std::get<props::DropOutRate>(grucell_props).get();
   const bool integrate_bias =
     std::get<props::IntegrateBias>(grucell_props).get();
+  const bool reset_after = std::get<props::ResetAfter>(grucell_props).get();
   const unsigned int max_timestep =
     std::get<props::MaxTimestep>(grucell_props).get();
   const unsigned int timestep = std::get<props::Timestep>(grucell_props).get();
@@ -262,13 +263,22 @@ void GRUCellLayer::forwarding(RunLayerContext &context, bool training) {
   Tensor r_gate = zr_gate.getSharedDataTensor({batch_size, unit}, unit, false);
 
   Tensor temp;
-  prev_hidden_state.dot(weight_hh_g, temp);
-  if (!disable_bias && !integrate_bias) {
-    Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
-    temp.add_i(bias_hh_g);
+  if (reset_after) {
+    prev_hidden_state.dot(weight_hh_g, temp);
+    if (!disable_bias && !integrate_bias) {
+      Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+      temp.add_i(bias_hh_g);
+    }
+    temp.multiply_i_strided(r_gate);
+    g_gate.add_i_strided(temp);
+  } else {
+    r_gate.multiply_strided(prev_hidden_state, temp);
+    temp.dot(weight_hh_g, g_gate, false, false, 1.0f);
+    if (!disable_bias && !integrate_bias) {
+      Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+      g_gate.add_i(bias_hh_g);
+    }
   }
-  temp.multiply_i_strided(r_gate);
-  g_gate.add_i_strided(temp);
   if (!disable_bias) {
     if (integrate_bias) {
       Tensor bias_h_g = bias_h.getSharedDataTensor({unit}, 2 * unit);
@@ -321,16 +331,17 @@ void GRUCellLayer::calcGradient(RunLayerContext &context) {
   const float dropout_rate = std::get<props::DropOutRate>(grucell_props).get();
   const bool integrate_bias =
     std::get<props::IntegrateBias>(grucell_props).get();
+  const bool reset_after = std::get<props::ResetAfter>(grucell_props).get();
   const unsigned int max_timestep =
     std::get<props::MaxTimestep>(grucell_props).get();
   const unsigned int timestep = std::get<props::Timestep>(grucell_props).get();
 
-  Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
   const unsigned int batch_size = input.getDim().batch();
 
   Tensor &djdweight_ih =
     context.getWeightGrad(wt_idx[GRUCellParams::weight_ih]);
-  Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]);
+  const Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]);
   Tensor &djdweight_hh =
     context.getWeightGrad(wt_idx[GRUCellParams::weight_hh]);
 
@@ -341,9 +352,9 @@ void GRUCellLayer::calcGradient(RunLayerContext &context) {
   Tensor &djdbias_ih = !disable_bias && !integrate_bias
                          ? context.getWeightGrad(wt_idx[GRUCellParams::bias_ih])
                          : empty;
-  Tensor &bias_hh = !disable_bias && !integrate_bias
-                      ? context.getWeight(wt_idx[GRUCellParams::bias_hh])
-                      : empty;
+  const Tensor &bias_hh = !disable_bias && !integrate_bias
+                            ? context.getWeight(wt_idx[GRUCellParams::bias_hh])
+                            : empty;
   Tensor &djdbias_hh = !disable_bias && !integrate_bias
                          ? context.getWeightGrad(wt_idx[GRUCellParams::bias_hh])
                          : empty;
@@ -355,7 +366,6 @@ void GRUCellLayer::calcGradient(RunLayerContext &context) {
   Tensor &hidden_states =
     context.getTensor(wt_idx[GRUCellParams::hidden_state]);
   hidden_states.reshape({max_timestep, 1, batch_size, unit});
-  Tensor hidden_state = hidden_states.getBatchSlice(timestep, 1);
   Tensor &hidden_states_derivatives =
     context.getTensorGrad(wt_idx[GRUCellParams::hidden_state]);
   Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
@@ -441,20 +451,37 @@ void GRUCellLayer::calcGradient(RunLayerContext &context) {
   Tensor temp = Tensor(batch_size, unit);
   Tensor dhg_;
   dhg_.copy_with_stride(dhg);
-  prev_hidden_state.dot(wg_hh, temp);
-  if (!disable_bias && !integrate_bias) {
-    Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
-    temp.add_i(bias_hh_g);
-  }
-  dhg_.multiply_strided(temp, dhr); // dhr = d15
 
-  // reset temp : prev_hidden_state * rt for djdbias_hh_g and dh_nx
-  dhg_.multiply_strided(rt, temp);
-  if (!disable_bias && !integrate_bias) {
-    Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
-    temp.sum(2, djdbias_hh_g, 1.0, 1.0);
+  if (reset_after) {
+    prev_hidden_state.dot(wg_hh, temp);
+    if (!disable_bias && !integrate_bias) {
+      const Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+      temp.add_i(bias_hh_g);
+    }
+    dhg_.multiply_strided(temp, dhr); // dhr = d15
+
+    // reset temp: dhg_ * rt for djdbias_hh_g, dh_nx and djdweight_hh_g
+    dhg_.multiply_strided(rt, temp);
+    if (!disable_bias && !integrate_bias) {
+      Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+      temp.sum(2, djdbias_hh_g, 1.0, 1.0);
+    }
+    temp.dot(wg_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14
+    djdweight_hh_g.add_i_strided(prev_hidden_state.dot(temp, true, false));
+  } else {
+    if (!disable_bias && !integrate_bias) {
+      Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+      dhg.sum(2, djdbias_hh_g, 1.0, 1.0);
+    }
+
+    dhg_.dot(wg_hh, temp, false, true);
+    temp.multiply_strided(prev_hidden_state, dhr);
+    temp.multiply_strided(rt, dh_nx, 1.0f);
+
+    // reset temp: rt * prev_hidden_state for djdweight_hh_g
+    rt.multiply_strided(prev_hidden_state, temp);
+    temp.dot(dhg_, djdweight_hh_g, true, false, 1.0f);
   }
-  temp.dot(wg_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14
 
   recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
 
@@ -469,12 +496,10 @@ void GRUCellLayer::calcGradient(RunLayerContext &context) {
     }
   }
 
-  djdweight_ih.add_i(input.dot(zrg_gate_derivative, true, false));
-
   Tensor dhzr_;
   dhzr_.copy_with_stride(dhzr);
   djdweight_hh_zr.add_i_strided(prev_hidden_state.dot(dhzr_, true, false));
-  djdweight_hh_g.add_i_strided(prev_hidden_state.dot(temp, true, false));
+  input.dot(zrg_gate_derivative, djdweight_ih, true, false, 1.0f);
   dhzr_.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17
 }
 
nntrainer/layers/grucell.h
index b3c30f0..d02281c 100644 (file)
@@ -106,13 +106,16 @@ private:
    * RecurrentActivation: activation type for recurrent. default is sigmoid
    * DropOutRate: dropout rate
    * IntegrateBias: integrate bias_ih, bias_hh to bias_h
+   * ResetAfter: whether to apply the reset gate before or after the matrix
+   * multiplication; the reset gate is applied after the multiplication if true
    * MaxTimeStep: Maximum timestep of gru
    * TimeStep: timestep for which gru should operate
    *
    * */
   std::tuple<props::Unit, props::HiddenStateActivation,
              props::RecurrentActivation, props::DropOutRate,
-             props::IntegrateBias, props::MaxTimestep, props::Timestep>
+             props::IntegrateBias, props::ResetAfter, props::MaxTimestep,
+             props::Timestep>
     grucell_props;
   std::array<unsigned int, 9> wt_idx; /**< indices of the weights */
 
index d6b19f8..dc5dcd3 100644 (file)
Binary files a/packaging/unittest_layers_v2.tar.gz and b/packaging/unittest_layers_v2.tar.gz differ
index 54b9a5b..8f1b98a 100644 (file)
Binary files a/packaging/unittest_models.tar.gz and b/packaging/unittest_models.tar.gz differ
test/input_gen/genLayerTests.py
index d132423..fbf513d 100644 (file)
@@ -119,29 +119,59 @@ if __name__ == "__main__":
                          return_state=False)
     record_single(lstm, (3, 4, 7), "lstm_multi_step_seq_act")
 
-    gru = K.layers.GRU(units=5, reset_after=False,
+    gru = K.layers.GRU(units=5, activation="tanh", 
                          recurrent_activation="sigmoid",
-                         activation="tanh",
+                         bias_initializer='GlorotUniform',
                          return_sequences=False,
-                         return_state=False)
+                         return_state=False,
+                         reset_after=False)
     record_single(gru, (3, 1, 7), "gru_single_step")
     record_single(gru, (3, 4, 7), "gru_multi_step")
 
-    gru = K.layers.GRU(units=5, reset_after=False,
+    gru = K.layers.GRU(units=5, activation="tanh", 
                          recurrent_activation="sigmoid",
-                         activation="tanh",
+                         bias_initializer='GlorotUniform',
                          return_sequences=True,
-                         return_state=False)
+                         return_state=False,
+                         reset_after=False)
     record_single(gru, (3, 1, 7), "gru_single_step_seq")
     record_single(gru, (3, 4, 7), "gru_multi_step_seq", input_type='float')
 
-    gru = K.layers.GRU(units=5, reset_after=False,
+    gru = K.layers.GRU(units=5, activation="sigmoid", 
                          recurrent_activation="tanh",
-                         activation="sigmoid",
+                         bias_initializer='GlorotUniform',
                          return_sequences=True,
-                         return_state=False)
+                         return_state=False,
+                         reset_after=False,)
     record_single(gru, (3, 4, 7), "gru_multi_step_seq_act", input_type='float')
 
+    # check reset_after
+    gru = K.layers.GRU(units=5, activation="tanh", 
+                         recurrent_activation="sigmoid",
+                         bias_initializer='GlorotUniform',
+                         return_sequences=False,
+                         return_state=False,
+                         reset_after=True,)
+    record_single(gru, (3, 1, 7), "gru_reset_after_single_step")
+    record_single(gru, (3, 4, 7), "gru_reset_after_multi_step")
+
+    gru = K.layers.GRU(units=5, activation="tanh", 
+                         recurrent_activation="sigmoid",
+                         bias_initializer='GlorotUniform',
+                         return_sequences=True,
+                         return_state=False,
+                         reset_after=True)
+    record_single(gru, (3, 1, 7), "gru_reset_after_single_step_seq")
+    record_single(gru, (3, 4, 7), "gru_reset_after_multi_step_seq", input_type='float')
+
+    gru = K.layers.GRU(units=5, activation="sigmoid", 
+                         recurrent_activation="tanh",
+                         bias_initializer='GlorotUniform',
+                         return_sequences=True,
+                         return_state=False,
+                         reset_after=True)
+    record_single(gru, (3, 4, 7), "gru_reset_after_multi_step_seq_act", input_type='float')
+
     dropout = K.layers.Dropout(rate=0.2)
     record_single(dropout, (2, 3, 2, 3), "dropout_20_training", {"training": True})
     record_single(dropout, (2, 3, 2, 3), "dropout_20_inference", {"training": False})
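
The recorded golden data also depends on how Keras stores the GRU bias: with
reset_after=True it keeps separate input and recurrent bias rows (shape
(2, 3*units)), while with reset_after=False it keeps a single merged bias
(shape (3*units,)), which is what the transLayer.py changes have to translate.
A small inspection snippet (assumes TensorFlow 2.x):

    import numpy as np
    import tensorflow as tf

    for reset_after in (False, True):
        gru = tf.keras.layers.GRU(units=5, reset_after=reset_after)
        gru(np.zeros((3, 4, 7), dtype=np.float32))   # build the layer
        kernel, recurrent_kernel, bias = gru.get_weights()
        # reset_after=False -> bias shape (15,); reset_after=True -> (2, 15)
        print(reset_after, kernel.shape, recurrent_kernel.shape, bias.shape)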
test/input_gen/genModelTests.py
index 8472141..a91ab1f 100644 (file)
@@ -41,608 +41,619 @@ opt = tf.keras.optimizers
 
 if __name__ == "__main__":
 
-    def multiout_test():
-        # x -> [a, b] -> c
-        x = K.Input(shape=(2, 3, 5), name="x")
-        # because the sort order is x -> [b, a] -> c, b0 must out first.
-        b0, a0 = MultiOutLayer(num_output=2)(x)
-        a1 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=3, strides=2, padding="same", name="multiout_a1"
-            )
-        )(a0)
-        a2 = K.layers.Activation("relu", name="multiout_a2")(a1)
-        a3 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=3, padding="same", name="multiout_a3"
-            )
-        )(a2)
-        a4 = K.layers.Flatten(name="multiout_a4")(a3)
-        a5 = K.layers.Dense(10, name="multiout_a5")(a4)
-        a6 = K.layers.Activation("softmax", name="multiout_a6")(a5)
-        b1 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=1, strides=2, padding="same", name="multiout_b1"
-            )
-        )(b0)
-        b2 = K.layers.Flatten(name="multiout_b2")(b1)
-        b3 = K.layers.Dense(10, name="multiout_b3")(b2)
-        b4 = K.layers.Activation("softmax", name="multiout_b4")(b3)
-
-        return x, [x, b0, b1, b2, b3, b4, a0, a1, a2, a3, a4, a5, a6]
-
-    x, y = multiout_test()
-    record(
-        loss_fn_str="mse",
-        file_name="multiple_output_model.info",
-        input_shape=(3, 2, 3, 5),
-        label_shape=(3, 10),
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        inputs=x,
-        outputs=y,
-        multi_out=[5, 12],
-        # debug=["name", "summary", "output", "initial_weights"],
-    )
-
-    ## please generate all test cases since golden data format can change anytime
-    fc_sigmoid = [
-        K.Input(shape=(3, 3)),
-        K.layers.Dense(5),
-        K.layers.Activation("sigmoid"),
-        K.layers.Dense(10),
-        K.layers.Activation("softmax"),
-    ]
-
-    fc_sigmoid_tc = partial(
-        record,
-        model=fc_sigmoid,
-        input_shape=(3, 3),
-        label_shape=(3, 10),
-        iteration=10,
-        optimizer=opt.SGD(learning_rate=1.0),
-    )
-
-    fc_sigmoid_tc(file_name="fc_sigmoid_mse.info", loss_fn_str="mse")
-
-    fc_sigmoid_tc(
-        file_name="fc_sigmoid_cross.info", loss_fn_str="cross_softmax",
-    )
-
-    fc_relu = [
-        K.Input(shape=(3)),
-        K.layers.Dense(10),
-        K.layers.Activation("relu"),
-        K.layers.Dense(2),
-        K.layers.Activation("sigmoid"),
-    ]
-
-    fc_relu_tc = partial(
-        record, model=fc_relu, input_shape=(3, 3), label_shape=(3, 2), iteration=10
-    )
-
-    fc_relu_tc(
-        file_name="fc_relu_mse.info",
-        loss_fn_str="mse",
-        optimizer=opt.SGD(learning_rate=0.1),
-    )
-
-    fc_bn_sigmoid = [
-        K.Input(shape=(3)),
-        K.layers.Dense(10),
-        K.layers.BatchNormalization(),
-        K.layers.Activation("sigmoid"),
-        K.layers.Dense(10),
-        K.layers.Activation("softmax"),
-    ]
-
-    fc_bn_sigmoid_tc = partial(
-        record,
-        model=fc_bn_sigmoid,
-        input_shape=(3, 3),
-        label_shape=(3, 10),
-        optimizer=opt.SGD(learning_rate=1),
-        iteration=10,
-    )
-
-    fc_bn_sigmoid_tc(
-        file_name="fc_bn_sigmoid_cross.info",
-        loss_fn_str="cross_softmax",
-        # debug=["summary", "iteration", "weights"],
-    )
-
-    fc_bn_sigmoid_tc(
-        file_name="fc_bn_sigmoid_mse.info", loss_fn_str="mse",
-    )
-
-    _mnist_block = lambda filter_size: [
-        K.layers.Conv2D(filters=filter_size, kernel_size=(3, 4)),
-        K.layers.Activation("sigmoid"),
-        K.layers.AveragePooling2D(pool_size=(2, 2)),
-    ]
-
-    mnist_conv = [
-        K.Input(shape=(2, 4, 5)),
-        *_mnist_block(2),
-        K.layers.Flatten(),
-        K.layers.Dense(10),
-        K.layers.Activation("softmax"),
-    ]
-
-    mnist_conv_tc = partial(
-        record, model=mnist_conv, optimizer=opt.SGD(learning_rate=0.1), iteration=10,
-    )
-
-    mnist_conv_tc(
-        input_shape=(3, 2, 4, 5),
-        label_shape=(3, 10),
-        file_name="mnist_conv_cross.info",
-        loss_fn_str="cross_softmax",
-        # debug=["summary", "loss", "layer_name", "initial_weights"],
-    )
-
-    mnist_conv_tc(
-        input_shape=(1, 2, 4, 5),
-        label_shape=(1, 10),
-        file_name="mnist_conv_cross_one_input.info",
-        loss_fn_str="cross_softmax",
-        # debug=["summary", "loss", "layer_name", "initial_weights"],
-    )
-
-    conv_nxn_model = lambda kernel_size: [
-        K.Input(shape=(2, 4, 5)),
-        K.layers.Conv2D(filters=4, kernel_size=kernel_size),
-        K.layers.Activation("sigmoid"),
-        K.layers.Flatten(),
-        K.layers.Dense(10),
-        K.layers.Activation("softmax"),
-    ]
-
-    conv_nxn_tc = partial(
-        record,
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 4, 5),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-    )
-
-    # 1x1 kernel size
-    conv_nxn_tc(
-        model=conv_nxn_model((1, 1)), file_name="conv_1x1.info",
-    )
-
-    # height width is same as input size
-    conv_nxn_tc(
-        model=conv_nxn_model((4, 5)), file_name="conv_input_matches_kernel.info"
-    )
-
-    conv_layer_tc = lambda **conv_args: partial(
-        record,
-        model=[
-            K.Input(shape=(2, 5, 3)),
-            K.layers.Conv2D(filters=4, kernel_size=(3, 3), **conv_args),
-            K.layers.Activation("sigmoid"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 5, 3),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-    )
-
-    conv_layer_tc()(file_name="conv_basic.info")
-    conv_layer_tc(padding="same")(file_name="conv_same_padding.info")  # padding: 1, 1
-    conv_layer_tc(strides=(2, 2))(file_name="conv_multi_stride.info")
-    conv_layer_tc(padding="same", strides=(2, 2))(  # padding: 1, 1
-        file_name="conv_same_padding_multi_stride.info"
-    )
-
-    conv_layer_tc(strides=(3, 3))(file_name="conv_uneven_strides.info")
-
-    record(
-        file_name="conv_uneven_strides2.info",
-        model=[
-            K.Input(shape=(2, 4, 4)),
-            K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(1, 2)),
-            K.layers.Activation("sigmoid"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 4, 4),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-        # debug="summary"
-    )
-
-    record(
-        file_name="conv_uneven_strides3.info",
-        model=[
-            K.Input(shape=(2, 4, 4)),
-            K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(2, 1)),
-            K.layers.Activation("sigmoid"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 4, 4),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-    )
-
-    record(
-        file_name="conv_bn.info",
-        model=[
-            K.Input(shape=(2, 3, 5)),
-            K.layers.Conv2D(filters=2, kernel_size=(2, 2)),
-            K.layers.BatchNormalization(),
-            K.layers.Activation("relu"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 3, 5),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-        # debug=["summary", "initial_weights"]
-    )
-
-    pool_layer_tc = lambda pool_layer: partial(
-        record,
-        model=[
-            K.Input(shape=(2, 5, 3)),
-            pool_layer,
-            K.layers.Activation("sigmoid"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 5, 3),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-    )
-
-    pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="same"))(
-        file_name="pooling_max_same_padding.info",  # debug="output"
-    )  # padding: 1, 1
-
-    pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="valid"))(
-        file_name="pooling_max_valid_padding.info",  # debug="output"
-    )  # padding: 1, 1
-
-    pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="same"))(
-        file_name="pooling_avg_same_padding.info",  # debug="dx"
-    )  # padding: 1, 1
-
-    pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="valid"))(
-        file_name="pooling_avg_valid_padding.info",  # debug="dx"
-    )
-
-    pool_layer_tc(K.layers.GlobalAvgPool2D(data_format="channels_first"))(
-        file_name="pooling_global_avg.info",  # debug="summary"
-    )
-
-    pool_layer_tc(K.layers.GlobalMaxPool2D(data_format="channels_first"))(
-        file_name="pooling_global_max.info",  # debug="dx"
-    )
-
-    pool_layer_tc2 = lambda pool_layer: partial(
-        record,
-        model=[
-            K.Input(shape=(2, 3, 5)),
-            pool_layer,
-            K.layers.Activation("sigmoid"),
-            K.layers.Flatten(),
-            K.layers.Dense(10),
-            K.layers.Activation("softmax"),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(3, 2, 3, 5),
-        label_shape=(3, 10),
-        loss_fn_str="cross_softmax",
-    )
-
-    pool_layer_tc2(K.layers.MaxPooling2D(pool_size=3, strides=2, padding="same"))(
-        file_name="pooling_max_same_padding_multi_stride.info",  # debug="dx"
-    )
-
-    pool_layer_tc2(K.layers.AveragePooling2D(pool_size=3, strides=2, padding="same"))(
-        file_name="pooling_avg_same_padding_multi_stride.info",  # debug="output"
-    )
-
-    def addition_test():
-        # x -> [a, b] -> c
-        x = K.Input(shape=(2, 3, 5), name="x")
-        # because the sort order is x -> [b, a] -> c, b0 must out first.
-        b0, a0 = MultiOutLayer(num_output=2)(x)
-        a1 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=3, strides=2, padding="same", name="addition_a1"
-            )
-        )(a0)
-        a2 = K.layers.Activation("relu", name="addition_a2")(a1)
-        a3 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=3, padding="same", name="addition_a3"
-            )
-        )(a2)
-        b1 = TL(
-            K.layers.Conv2D(
-                filters=4, kernel_size=1, strides=2, padding="same", name="addition_b1"
-            )
-        )(b0)
-        c1 = K.layers.Add(name="addition_c1")([a3, b1])
-        c2 = K.layers.Flatten(name="addition_c2")(c1)
-        c3 = K.layers.Dense(10, name="addition_c3")(c2)
-        c4 = K.layers.Activation("softmax", name="addition_c4")(c3)
-
-        return x, [x, b0, b1, a0, a1, a2, a3, c1, c2, c3, c4]
-
-    x, y = addition_test()
-    record(
-        loss_fn_str="mse",
-        file_name="addition_resnet_like.info",
-        input_shape=(3, 2, 3, 5),
-        label_shape=(3, 10),
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        inputs=x,
-        outputs=y,
-        # debug=["name", "summary", "output", "initial_weights"],
-    )
-
-
-    def resnet18(num_class, input_shape):
-        def block(x, filters, kernel_size, downsample = False):
-            # because the sort order is x -> [b, a] -> c, b0 must out first.
-            b0, a0 = MultiOutLayer(num_output=2)(x)
-            a1 = TL(K.layers.Conv2D(kernel_size=kernel_size,
-                    strides= (1 if not downsample else 2),
-                    filters=filters,
-                    padding="same"))(a0)
-            a2 = TL(K.layers.BatchNormalization())(a1)
-            a3 = TL(K.layers.ReLU())(a2)
-            a4 = TL(K.layers.Conv2D(kernel_size=kernel_size,
-                    strides=1,
-                    filters=filters,
-                    padding="same"))(a3)
-
-            if downsample:
-                b1 = TL(K.layers.Conv2D(kernel_size=1,
-                        strides=2,
-                        filters=filters,
-                        padding="same"))(b0)
-            else:
-                b1 = b0
-            o1 = K.layers.Add()([a4, b1])
-            o2 = TL(K.layers.BatchNormalization())(o1)
-            o3 = K.layers.Activation("relu")(o2)
-
-            if (downsample):
-                ret_array = [a0, a1, a2, a3, a4, b0, b1, o1, o2, o3]
-            else:
-                ret_array = [a0, a1, a2, a3, a4, b0, o1, o2, o3]
-            return ret_array
-
-
-        # x -> [a, b] -> c
-        x = K.Input(shape=input_shape, name="x")
-        out_nodes = [x]
-        # initial section of resnet
-        conv0 = TL(K.layers.Conv2D(
-                filters=64, kernel_size=3, strides=1, padding="same"))
-        bn0 = TL(K.layers.BatchNormalization())
-        act0 = K.layers.Activation("relu")
-
-        out_nodes.append(conv0(out_nodes[-1]))
-        out_nodes.append(bn0(out_nodes[-1]))
-        out_nodes.append(act0(out_nodes[-1]))
-
-        # Add all the resnet blocks
-        out_nodes.extend(block(out_nodes[-1], 64, 3, False))
-        out_nodes.extend(block(out_nodes[-1], 64, 3, False))
-        out_nodes.extend(block(out_nodes[-1], 128, 3, True))
-        out_nodes.extend(block(out_nodes[-1], 128, 3, False))
-        out_nodes.extend(block(out_nodes[-1], 256, 3, True))
-        out_nodes.extend(block(out_nodes[-1], 256, 3, False))
-        out_nodes.extend(block(out_nodes[-1], 512, 3, True))
-        out_nodes.extend(block(out_nodes[-1], 512, 3, False))
-
-        # add the suffix part
-        pool0 = TL(K.layers.AveragePooling2D(pool_size=4))
-        flat0 = K.layers.Flatten()
-        dense0 = K.layers.Dense(num_class)
-        sm0 = K.layers.Activation("softmax")
-
-        out_nodes.append(pool0(out_nodes[-1]))
-        out_nodes.append(flat0(out_nodes[-1]))
-        out_nodes.append(dense0(out_nodes[-1]))
-        out_nodes.append(sm0(out_nodes[-1]))
-
-        return x, out_nodes
-
-    x, y = resnet18(100, (3,32,32))
-    record(
-        loss_fn_str="cross_softmax",
-        file_name="ResNet18.info",
-        input_shape=(2, 3, 32, 32),
-        label_shape=(2, 100),
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=2,
-        inputs=x,
-        outputs=y,
-        record_only_outputs=True
-        # debug=["file_shape_generation", "name"],
-    )
-
-    lstm_layer_tc = lambda batch, time, return_sequences: partial(
-        record,
-        model=[
-            K.Input(shape=(time, 1)),
-            K.layers.LSTM(
-                time,
-                recurrent_activation="sigmoid",
-                activation="tanh",
-                return_sequences=return_sequences,
-            ),
-            K.layers.Dense(1),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(batch, time, 1),
-        label_shape=(batch, time, 1),
-        is_onehot=False,
-        loss_fn_str="mse",
-    )
-
-    lstm_layer_tc(1, 1, False)(file_name="lstm_basic.info")
-    lstm_layer_tc(1, 2, True)(file_name="lstm_return_sequence.info")
-    lstm_layer_tc(2, 2, True)(file_name="lstm_return_sequence_with_batch.info")
-
-    multi_lstm_layer_tc = lambda batch, time: partial(
-        record,
-        model=[
-            K.Input(batch_shape=(batch, time, 1)),
-            K.layers.LSTM(
-                time,
-                recurrent_activation="sigmoid",
-                activation="tanh",
-                return_sequences=True,
-            ),
-            K.layers.LSTM(time, recurrent_activation="sigmoid", activation="tanh"),
-            K.layers.Dense(1),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(batch, time, 1),
-        label_shape=(batch, 1),
-        is_onehot=False,
-        loss_fn_str="mse",
-    )
-    multi_lstm_layer_tc(1,2)(file_name="multi_lstm_return_sequence.info")
-    multi_lstm_layer_tc(2,2)(file_name="multi_lstm_return_sequence_with_batch.info")
-
-    rnn_layer_tc = lambda batch, time, return_sequences: partial(
-        record,
-        model=[
-            K.Input(shape=(time, 1)),
-            K.layers.SimpleRNN(2, return_sequences=return_sequences),
-            K.layers.Dense(1),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(batch, time, 1),
-        label_shape=(batch, time, 1),
-        is_onehot=False,
-        loss_fn_str="mse",
-    )
-    rnn_layer_tc(1, 1, False)(file_name="rnn_basic.info")
-    rnn_layer_tc(1, 2, True)(file_name="rnn_return_sequences.info")
-    rnn_layer_tc(2, 2, True)(file_name="rnn_return_sequence_with_batch.info")
-
-    multi_rnn_layer_tc = lambda batch, time: partial(
+    # def multiout_test():
+    #     # x -> [a, b] -> c
+    #     x = K.Input(shape=(2, 3, 5), name="x")
+    #     # because the sort order is x -> [b, a] -> c, b0 must out first.
+    #     b0, a0 = MultiOutLayer(num_output=2)(x)
+    #     a1 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=3, strides=2, padding="same", name="multiout_a1"
+    #         )
+    #     )(a0)
+    #     a2 = K.layers.Activation("relu", name="multiout_a2")(a1)
+    #     a3 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=3, padding="same", name="multiout_a3"
+    #         )
+    #     )(a2)
+    #     a4 = K.layers.Flatten(name="multiout_a4")(a3)
+    #     a5 = K.layers.Dense(10, name="multiout_a5")(a4)
+    #     a6 = K.layers.Activation("softmax", name="multiout_a6")(a5)
+    #     b1 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=1, strides=2, padding="same", name="multiout_b1"
+    #         )
+    #     )(b0)
+    #     b2 = K.layers.Flatten(name="multiout_b2")(b1)
+    #     b3 = K.layers.Dense(10, name="multiout_b3")(b2)
+    #     b4 = K.layers.Activation("softmax", name="multiout_b4")(b3)
+
+    #     return x, [x, b0, b1, b2, b3, b4, a0, a1, a2, a3, a4, a5, a6]
+
+    # x, y = multiout_test()
+    # record(
+    #     loss_fn_str="mse",
+    #     file_name="multiple_output_model.info",
+    #     input_shape=(3, 2, 3, 5),
+    #     label_shape=(3, 10),
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     inputs=x,
+    #     outputs=y,
+    #     multi_out=[5, 12],
+    #     # debug=["name", "summary", "output", "initial_weights"],
+    # )
+
+    # ## please generate all test cases since golden data format can change anytime
+    # fc_sigmoid = [
+    #     K.Input(shape=(3, 3)),
+    #     K.layers.Dense(5),
+    #     K.layers.Activation("sigmoid"),
+    #     K.layers.Dense(10),
+    #     K.layers.Activation("softmax"),
+    # ]
+
+    # fc_sigmoid_tc = partial(
+    #     record,
+    #     model=fc_sigmoid,
+    #     input_shape=(3, 3),
+    #     label_shape=(3, 10),
+    #     iteration=10,
+    #     optimizer=opt.SGD(learning_rate=1.0),
+    # )
+
+    # fc_sigmoid_tc(file_name="fc_sigmoid_mse.info", loss_fn_str="mse")
+
+    # fc_sigmoid_tc(
+    #     file_name="fc_sigmoid_cross.info", loss_fn_str="cross_softmax",
+    # )
+
+    # fc_relu = [
+    #     K.Input(shape=(3)),
+    #     K.layers.Dense(10),
+    #     K.layers.Activation("relu"),
+    #     K.layers.Dense(2),
+    #     K.layers.Activation("sigmoid"),
+    # ]
+
+    # fc_relu_tc = partial(
+    #     record, model=fc_relu, input_shape=(3, 3), label_shape=(3, 2), iteration=10
+    # )
+
+    # fc_relu_tc(
+    #     file_name="fc_relu_mse.info",
+    #     loss_fn_str="mse",
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    # )
+
+    # fc_bn_sigmoid = [
+    #     K.Input(shape=(3)),
+    #     K.layers.Dense(10),
+    #     K.layers.BatchNormalization(),
+    #     K.layers.Activation("sigmoid"),
+    #     K.layers.Dense(10),
+    #     K.layers.Activation("softmax"),
+    # ]
+
+    # fc_bn_sigmoid_tc = partial(
+    #     record,
+    #     model=fc_bn_sigmoid,
+    #     input_shape=(3, 3),
+    #     label_shape=(3, 10),
+    #     optimizer=opt.SGD(learning_rate=1),
+    #     iteration=10,
+    # )
+
+    # fc_bn_sigmoid_tc(
+    #     file_name="fc_bn_sigmoid_cross.info",
+    #     loss_fn_str="cross_softmax",
+    #     # debug=["summary", "iteration", "weights"],
+    # )
+
+    # fc_bn_sigmoid_tc(
+    #     file_name="fc_bn_sigmoid_mse.info", loss_fn_str="mse",
+    # )
+
+    # _mnist_block = lambda filter_size: [
+    #     K.layers.Conv2D(filters=filter_size, kernel_size=(3, 4)),
+    #     K.layers.Activation("sigmoid"),
+    #     K.layers.AveragePooling2D(pool_size=(2, 2)),
+    # ]
+
+    # mnist_conv = [
+    #     K.Input(shape=(2, 4, 5)),
+    #     *_mnist_block(2),
+    #     K.layers.Flatten(),
+    #     K.layers.Dense(10),
+    #     K.layers.Activation("softmax"),
+    # ]
+
+    # mnist_conv_tc = partial(
+    #     record, model=mnist_conv, optimizer=opt.SGD(learning_rate=0.1), iteration=10,
+    # )
+
+    # mnist_conv_tc(
+    #     input_shape=(3, 2, 4, 5),
+    #     label_shape=(3, 10),
+    #     file_name="mnist_conv_cross.info",
+    #     loss_fn_str="cross_softmax",
+    #     # debug=["summary", "loss", "layer_name", "initial_weights"],
+    # )
+
+    # mnist_conv_tc(
+    #     input_shape=(1, 2, 4, 5),
+    #     label_shape=(1, 10),
+    #     file_name="mnist_conv_cross_one_input.info",
+    #     loss_fn_str="cross_softmax",
+    #     # debug=["summary", "loss", "layer_name", "initial_weights"],
+    # )
+
+    # conv_nxn_model = lambda kernel_size: [
+    #     K.Input(shape=(2, 4, 5)),
+    #     K.layers.Conv2D(filters=4, kernel_size=kernel_size),
+    #     K.layers.Activation("sigmoid"),
+    #     K.layers.Flatten(),
+    #     K.layers.Dense(10),
+    #     K.layers.Activation("softmax"),
+    # ]
+
+    # conv_nxn_tc = partial(
+    #     record,
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 4, 5),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    # )
+
+    # # 1x1 kernel size
+    # conv_nxn_tc(
+    #     model=conv_nxn_model((1, 1)), file_name="conv_1x1.info",
+    # )
+
+    # # height width is same as input size
+    # conv_nxn_tc(
+    #     model=conv_nxn_model((4, 5)), file_name="conv_input_matches_kernel.info"
+    # )
+
+    # conv_layer_tc = lambda **conv_args: partial(
+    #     record,
+    #     model=[
+    #         K.Input(shape=(2, 5, 3)),
+    #         K.layers.Conv2D(filters=4, kernel_size=(3, 3), **conv_args),
+    #         K.layers.Activation("sigmoid"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 5, 3),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    # )
+
+    # conv_layer_tc()(file_name="conv_basic.info")
+    # conv_layer_tc(padding="same")(file_name="conv_same_padding.info")  # padding: 1, 1
+    # conv_layer_tc(strides=(2, 2))(file_name="conv_multi_stride.info")
+    # conv_layer_tc(padding="same", strides=(2, 2))(  # padding: 1, 1
+    #     file_name="conv_same_padding_multi_stride.info"
+    # )
+
+    # conv_layer_tc(strides=(3, 3))(file_name="conv_uneven_strides.info")
+
+    # record(
+    #     file_name="conv_uneven_strides2.info",
+    #     model=[
+    #         K.Input(shape=(2, 4, 4)),
+    #         K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(1, 2)),
+    #         K.layers.Activation("sigmoid"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 4, 4),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    #     # debug="summary"
+    # )
+
+    # record(
+    #     file_name="conv_uneven_strides3.info",
+    #     model=[
+    #         K.Input(shape=(2, 4, 4)),
+    #         K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(2, 1)),
+    #         K.layers.Activation("sigmoid"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 4, 4),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    # )
+
+    # record(
+    #     file_name="conv_bn.info",
+    #     model=[
+    #         K.Input(shape=(2, 3, 5)),
+    #         K.layers.Conv2D(filters=2, kernel_size=(2, 2)),
+    #         K.layers.BatchNormalization(),
+    #         K.layers.Activation("relu"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 3, 5),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    #     # debug=["summary", "initial_weights"]
+    # )
+
+    # pool_layer_tc = lambda pool_layer: partial(
+    #     record,
+    #     model=[
+    #         K.Input(shape=(2, 5, 3)),
+    #         pool_layer,
+    #         K.layers.Activation("sigmoid"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 5, 3),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    # )
+
+    # pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="same"))(
+    #     file_name="pooling_max_same_padding.info",  # debug="output"
+    # )  # padding: 1, 1
+
+    # pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="valid"))(
+    #     file_name="pooling_max_valid_padding.info",  # debug="output"
+    # )  # padding: 1, 1
+
+    # pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="same"))(
+    #     file_name="pooling_avg_same_padding.info",  # debug="dx"
+    # )  # padding: 1, 1
+
+    # pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="valid"))(
+    #     file_name="pooling_avg_valid_padding.info",  # debug="dx"
+    # )
+
+    # pool_layer_tc(K.layers.GlobalAvgPool2D(data_format="channels_first"))(
+    #     file_name="pooling_global_avg.info",  # debug="summary"
+    # )
+
+    # pool_layer_tc(K.layers.GlobalMaxPool2D(data_format="channels_first"))(
+    #     file_name="pooling_global_max.info",  # debug="dx"
+    # )
+
+    # pool_layer_tc2 = lambda pool_layer: partial(
+    #     record,
+    #     model=[
+    #         K.Input(shape=(2, 3, 5)),
+    #         pool_layer,
+    #         K.layers.Activation("sigmoid"),
+    #         K.layers.Flatten(),
+    #         K.layers.Dense(10),
+    #         K.layers.Activation("softmax"),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(3, 2, 3, 5),
+    #     label_shape=(3, 10),
+    #     loss_fn_str="cross_softmax",
+    # )
+
+    # pool_layer_tc2(K.layers.MaxPooling2D(pool_size=3, strides=2, padding="same"))(
+    #     file_name="pooling_max_same_padding_multi_stride.info",  # debug="dx"
+    # )
+
+    # pool_layer_tc2(K.layers.AveragePooling2D(pool_size=3, strides=2, padding="same"))(
+    #     file_name="pooling_avg_same_padding_multi_stride.info",  # debug="output"
+    # )
+
+    # def addition_test():
+    #     # x -> [a, b] -> c
+    #     x = K.Input(shape=(2, 3, 5), name="x")
+    #     # because the sort order is x -> [b, a] -> c, b0 must out first.
+    #     b0, a0 = MultiOutLayer(num_output=2)(x)
+    #     a1 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=3, strides=2, padding="same", name="addition_a1"
+    #         )
+    #     )(a0)
+    #     a2 = K.layers.Activation("relu", name="addition_a2")(a1)
+    #     a3 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=3, padding="same", name="addition_a3"
+    #         )
+    #     )(a2)
+    #     b1 = TL(
+    #         K.layers.Conv2D(
+    #             filters=4, kernel_size=1, strides=2, padding="same", name="addition_b1"
+    #         )
+    #     )(b0)
+    #     c1 = K.layers.Add(name="addition_c1")([a3, b1])
+    #     c2 = K.layers.Flatten(name="addition_c2")(c1)
+    #     c3 = K.layers.Dense(10, name="addition_c3")(c2)
+    #     c4 = K.layers.Activation("softmax", name="addition_c4")(c3)
+
+    #     return x, [x, b0, b1, a0, a1, a2, a3, c1, c2, c3, c4]
+
+    # x, y = addition_test()
+    # record(
+    #     loss_fn_str="mse",
+    #     file_name="addition_resnet_like.info",
+    #     input_shape=(3, 2, 3, 5),
+    #     label_shape=(3, 10),
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     inputs=x,
+    #     outputs=y,
+    #     # debug=["name", "summary", "output", "initial_weights"],
+    # )
+
+
+    # def resnet18(num_class, input_shape):
+    #     def block(x, filters, kernel_size, downsample = False):
+    #         # because the sort order is x -> [b, a] -> c, b0 must out first.
+    #         b0, a0 = MultiOutLayer(num_output=2)(x)
+    #         a1 = TL(K.layers.Conv2D(kernel_size=kernel_size,
+    #                 strides= (1 if not downsample else 2),
+    #                 filters=filters,
+    #                 padding="same"))(a0)
+    #         a2 = TL(K.layers.BatchNormalization())(a1)
+    #         a3 = TL(K.layers.ReLU())(a2)
+    #         a4 = TL(K.layers.Conv2D(kernel_size=kernel_size,
+    #                 strides=1,
+    #                 filters=filters,
+    #                 padding="same"))(a3)
+
+    #         if downsample:
+    #             b1 = TL(K.layers.Conv2D(kernel_size=1,
+    #                     strides=2,
+    #                     filters=filters,
+    #                     padding="same"))(b0)
+    #         else:
+    #             b1 = b0
+    #         o1 = K.layers.Add()([a4, b1])
+    #         o2 = TL(K.layers.BatchNormalization())(o1)
+    #         o3 = K.layers.Activation("relu")(o2)
+
+    #         if (downsample):
+    #             ret_array = [a0, a1, a2, a3, a4, b0, b1, o1, o2, o3]
+    #         else:
+    #             ret_array = [a0, a1, a2, a3, a4, b0, o1, o2, o3]
+    #         return ret_array
+
+
+    #     # x -> [a, b] -> c
+    #     x = K.Input(shape=input_shape, name="x")
+    #     out_nodes = [x]
+    #     # initial section of resnet
+    #     conv0 = TL(K.layers.Conv2D(
+    #             filters=64, kernel_size=3, strides=1, padding="same"))
+    #     bn0 = TL(K.layers.BatchNormalization())
+    #     act0 = K.layers.Activation("relu")
+
+    #     out_nodes.append(conv0(out_nodes[-1]))
+    #     out_nodes.append(bn0(out_nodes[-1]))
+    #     out_nodes.append(act0(out_nodes[-1]))
+
+    #     # Add all the resnet blocks
+    #     out_nodes.extend(block(out_nodes[-1], 64, 3, False))
+    #     out_nodes.extend(block(out_nodes[-1], 64, 3, False))
+    #     out_nodes.extend(block(out_nodes[-1], 128, 3, True))
+    #     out_nodes.extend(block(out_nodes[-1], 128, 3, False))
+    #     out_nodes.extend(block(out_nodes[-1], 256, 3, True))
+    #     out_nodes.extend(block(out_nodes[-1], 256, 3, False))
+    #     out_nodes.extend(block(out_nodes[-1], 512, 3, True))
+    #     out_nodes.extend(block(out_nodes[-1], 512, 3, False))
+
+    #     # add the suffix part
+    #     pool0 = TL(K.layers.AveragePooling2D(pool_size=4))
+    #     flat0 = K.layers.Flatten()
+    #     dense0 = K.layers.Dense(num_class)
+    #     sm0 = K.layers.Activation("softmax")
+
+    #     out_nodes.append(pool0(out_nodes[-1]))
+    #     out_nodes.append(flat0(out_nodes[-1]))
+    #     out_nodes.append(dense0(out_nodes[-1]))
+    #     out_nodes.append(sm0(out_nodes[-1]))
+
+    #     return x, out_nodes
+
+    # x, y = resnet18(100, (3,32,32))
+    # record(
+    #     loss_fn_str="cross_softmax",
+    #     file_name="ResNet18.info",
+    #     input_shape=(2, 3, 32, 32),
+    #     label_shape=(2, 100),
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=2,
+    #     inputs=x,
+    #     outputs=y,
+    #     record_only_outputs=True
+    #     # debug=["file_shape_generation", "name"],
+    # )
+
+    # lstm_layer_tc = lambda batch, time, return_sequences: partial(
+    #     record,
+    #     model=[
+    #         K.Input(shape=(time, 1)),
+    #         K.layers.LSTM(
+    #             time,
+    #             recurrent_activation="sigmoid",
+    #             activation="tanh",
+    #             return_sequences=return_sequences,
+    #         ),
+    #         K.layers.Dense(1),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(batch, time, 1),
+    #     label_shape=(batch, time, 1),
+    #     is_onehot=False,
+    #     loss_fn_str="mse",
+    # )
+
+    # lstm_layer_tc(1, 1, False)(file_name="lstm_basic.info")
+    # lstm_layer_tc(1, 2, True)(file_name="lstm_return_sequence.info")
+    # lstm_layer_tc(2, 2, True)(file_name="lstm_return_sequence_with_batch.info")
+
+    # multi_lstm_layer_tc = lambda batch, time: partial(
+    #     record,
+    #     model=[
+    #         K.Input(batch_shape=(batch, time, 1)),
+    #         K.layers.LSTM(
+    #             time,
+    #             recurrent_activation="sigmoid",
+    #             activation="tanh",
+    #             return_sequences=True,
+    #         ),
+    #         K.layers.LSTM(time, recurrent_activation="sigmoid", activation="tanh"),
+    #         K.layers.Dense(1),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(batch, time, 1),
+    #     label_shape=(batch, 1),
+    #     is_onehot=False,
+    #     loss_fn_str="mse",
+    # )
+    # multi_lstm_layer_tc(1,2)(file_name="multi_lstm_return_sequence.info")
+    # multi_lstm_layer_tc(2,2)(file_name="multi_lstm_return_sequence_with_batch.info")
+
+    # rnn_layer_tc = lambda batch, time, return_sequences: partial(
+    #     record,
+    #     model=[
+    #         K.Input(shape=(time, 1)),
+    #         K.layers.SimpleRNN(2, return_sequences=return_sequences),
+    #         K.layers.Dense(1),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(batch, time, 1),
+    #     label_shape=(batch, time, 1),
+    #     is_onehot=False,
+    #     loss_fn_str="mse",
+    # )
+    # rnn_layer_tc(1, 1, False)(file_name="rnn_basic.info")
+    # rnn_layer_tc(1, 2, True)(file_name="rnn_return_sequences.info")
+    # rnn_layer_tc(2, 2, True)(file_name="rnn_return_sequence_with_batch.info")
+
+    # multi_rnn_layer_tc = lambda batch, time: partial(
+    #     record,
+    #     model=[
+    #         K.Input(batch_shape=(batch, time, 1)),
+    #         K.layers.SimpleRNN(
+    #             time,
+    #             return_sequences=True,
+    #         ),
+    #         K.layers.SimpleRNN(time),
+    #         K.layers.Dense(1),
+    #     ],
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     input_shape=(batch, time, 1),
+    #     label_shape=(batch, 1),
+    #     is_onehot=False,
+    #     loss_fn_str="mse",
+    # )
+    # multi_rnn_layer_tc(1,2)(file_name="multi_rnn_return_sequence.info")
+    # multi_rnn_layer_tc(2,2)(file_name="multi_rnn_return_sequence_with_batch.info")
+
+    gru_layer_tc = lambda batch, time, unit, feature_size, return_sequences, reset_after: partial(
         record,
         model=[
-            K.Input(batch_shape=(batch, time, 1)),
-            K.layers.SimpleRNN(
-                time,
-                return_sequences=True,
-            ),
-            K.layers.SimpleRNN(time),
-            K.layers.Dense(1),
-        ],
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        input_shape=(batch, time, 1),
-        label_shape=(batch, 1),
-        is_onehot=False,
-        loss_fn_str="mse",
-    )
-    multi_rnn_layer_tc(1,2)(file_name="multi_rnn_return_sequence.info")
-    multi_rnn_layer_tc(2,2)(file_name="multi_rnn_return_sequence_with_batch.info")
-
-    gru_layer_tc = lambda batch, time, return_sequences: partial(
-        record,
-        model=[
-            K.Input(batch_shape=(batch, time, 1)),
+            K.Input(batch_shape=(batch, time, feature_size)),
             K.layers.GRU(
-                time,
-                recurrent_activation="sigmoid",
+                unit,
                 activation="tanh",
+                recurrent_activation="sigmoid",
+                bias_initializer='GlorotUniform',
                 return_sequences=return_sequences,
+                reset_after=reset_after,
             ),
             K.layers.Dense(1),
         ],
         optimizer=opt.SGD(learning_rate=0.1),
         iteration=10,
-        input_shape=(batch, time, 1),
+        input_shape=(batch, time, feature_size),
         label_shape=(batch, time, 1),
         is_onehot=False,
         loss_fn_str="mse"
     )
 
-    gru_layer_tc(1, 1, False)(file_name="gru_basic.info")
-    gru_layer_tc(1, 2, True)(file_name="gru_return_sequence.info")
-    gru_layer_tc(2, 2, True)(file_name="gru_return_sequence_with_batch.info")
+    gru_layer_tc(1, 1, 3, 4, False, False)(file_name="gru_basic.info")
+    gru_layer_tc(1, 2, 3, 4, True, False)(file_name="gru_return_sequence.info")
+    gru_layer_tc(2, 2, 3, 4, True, False)(file_name="gru_return_sequence_with_batch.info")
+    # Check reset_after
+    gru_layer_tc(1, 1, 3, 4, False, True)(file_name="gru_reset_after_basic.info")
+    gru_layer_tc(1, 2, 3, 4, True, True)(file_name="gru_reset_after_return_sequence.info")
+    gru_layer_tc(2, 2, 3, 4, True, True)(file_name="gru_reset_after_return_sequence_with_batch.info")
 
-    multi_gru_layer_tc = lambda batch, time: partial(
+    multi_gru_layer_tc = lambda batch, time, unit, feature_size, reset_after: partial(
         record,
         model=[
-            K.Input(batch_shape=(batch, time, 1)),
+            K.Input(batch_shape=(batch, time, feature_size)),
             K.layers.GRU(
-                time,
-                recurrent_activation="sigmoid",
+                unit,
                 activation="tanh",
+                recurrent_activation="sigmoid",
+                bias_initializer='GlorotUniform',
                 return_sequences=True,
+                reset_after=reset_after,
             ),
-            K.layers.GRU(time, recurrent_activation="sigmoid", activation="tanh"),
+            K.layers.GRU(unit, activation="tanh", recurrent_activation="sigmoid", bias_initializer='GlorotUniform', reset_after=reset_after),
             K.layers.Dense(1),
         ],
         optimizer=opt.SGD(learning_rate=0.1),
         iteration=10,
-        input_shape=(batch, time, 1),
+        input_shape=(batch, time, feature_size),
         label_shape=(batch, 1),
         is_onehot=False,
         loss_fn_str="mse",
     )
-    multi_gru_layer_tc(1,2)(file_name="multi_gru_return_sequence.info")
-    multi_gru_layer_tc(2,2)(file_name="multi_gru_return_sequence_with_batch.info")
-
-    def multiout_test():
-        # x -> [a, b] -> c
-        x = K.Input(shape=(1, 10), name="x")
-        fc = K.layers.Dense(2, name="fc")(x)
-        b0, a0 = MultiOutLayer(num_output=2)(fc)
-        fc1 = K.layers.Dense(2, name="fc1")(a0)
-        fc2 = K.layers.Dense(2, name="fc2")(b0)
-        add1 = K.layers.Add(name="add_1")([fc1, fc2]) # [a, b] -> c
-        fc3 = K.layers.Dense(3, name="fc3")(add1)
-        sm = K.layers.Activation("softmax", name="sm")(fc3)
-
-        return x, [x, fc, b0, a0, fc1, fc2, add1, fc3, sm]
-
-    x, y = multiout_test()
-    record(
-        loss_fn_str="mse",
-        file_name="multiout_model.info",
-        input_shape=(3, 10),
-        label_shape=(3, 3),
-        optimizer=opt.SGD(learning_rate=0.1),
-        iteration=10,
-        inputs=x,
-        outputs=y,
-        # debug=["name", "summary", "output", "initial_weights"],
-    )
+    multi_gru_layer_tc(1, 2, 3, 4, False)(file_name="multi_gru_return_sequence.info")
+    multi_gru_layer_tc(2, 2, 3, 4, False)(file_name="multi_gru_return_sequence_with_batch.info")
+    # Check reset_after
+    multi_gru_layer_tc(1, 2, 3, 4, True)(file_name="multi_gru_reset_after_return_sequence.info")
+    multi_gru_layer_tc(2, 2, 3, 4, True)(file_name="multi_gru_reset_after_return_sequence_with_batch.info")
+
+    # def multiout_test():
+    #     # x -> [a, b] -> c
+    #     x = K.Input(shape=(1, 10), name="x")
+    #     fc = K.layers.Dense(2, name="fc")(x)
+    #     b0, a0 = MultiOutLayer(num_output=2)(fc)
+    #     fc1 = K.layers.Dense(2, name="fc1")(a0)
+    #     fc2 = K.layers.Dense(2, name="fc2")(b0)
+    #     add1 = K.layers.Add(name="add_1")([fc1, fc2]) # [a, b] -> c
+    #     fc3 = K.layers.Dense(3, name="fc3")(add1)
+    #     sm = K.layers.Activation("softmax", name="sm")(fc3)
+
+    #     return x, [x, fc, b0, a0, fc1, fc2, add1, fc3, sm]
+
+    # x, y = multiout_test()
+    # record(
+    #     loss_fn_str="mse",
+    #     file_name="multiout_model.info",
+    #     input_shape=(3, 10),
+    #     label_shape=(3, 3),
+    #     optimizer=opt.SGD(learning_rate=0.1),
+    #     iteration=10,
+    #     inputs=x,
+    #     outputs=y,
+    #     # debug=["name", "summary", "output", "initial_weights"],
+    # )
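The gru_layer_tc / multi_gru_layer_tc generators added above now take unit, feature_size and reset_after, so golden data is recorded for both reset-gate variants. What the flag changes in a single step (following the Keras GRU semantics these goldens are generated against): with reset_after false the reset gate multiplies the previous hidden state before the recurrent matmul and only one bias is applied; with reset_after true the gate multiplies the finished matmul and a separate recurrent bias (bias_hh) is added, which is also why the layer tests further down pair integrate_bias=true with reset_after=false and integrate_bias=false with reset_after=true. A rough NumPy sketch of one step, with shapes and helpers as assumptions:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x, h_prev, w_ih, w_hh, b_ih, b_hh, reset_after):
        # w_ih: (feature, 3*unit), w_hh: (unit, 3*unit); Keras gate order z, r, g
        unit = h_prev.shape[-1]
        xz, xr, xg = np.split(x @ w_ih + b_ih, 3, axis=-1)
        if reset_after:
            # bias_hh participates and the reset gate is applied after the matmul
            hz, hr, hg = np.split(h_prev @ w_hh + b_hh, 3, axis=-1)
            z, r = sigmoid(xz + hz), sigmoid(xr + hr)
            g = np.tanh(xg + r * hg)
        else:
            # reset gate is applied to h_prev before the candidate matmul; no bias_hh
            hz, hr = np.split(h_prev @ w_hh[:, :2 * unit], 2, axis=-1)
            z, r = sigmoid(xz + hz), sigmoid(xr + hr)
            g = np.tanh(xg + (r * h_prev) @ w_hh[:, 2 * unit:])
        return z * h_prev + (1.0 - z) * g

Both branches end in the same update h = z * h_prev + (1 - z) * g; only the candidate term and the bias layout differ, so the two sets of *.info files generated above are not interchangeable.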
index c9422bc..f526096 100644 (file)
@@ -213,6 +213,19 @@ class MultiOutLayer(IdentityTransLayer):
 
         return [layer(tf_output) for layer in self.stub_layers]
 
+##
+# @brief Translayer for gru layer
+class GRUTransLayer(IdentityTransLayer):
+    def to_nntr_weights(self, tensorOrList):
+        bias = tensorOrList[2]
+        if bias.shape.rank == 2:
+            bias_ih, bias_hh = bias[0], bias[1]
+            return [tensorOrList[0], tensorOrList[1], bias_ih, bias_hh]
+        else:
+            return tensorOrList
+
+    def to_nntr_trainable_weights(self, tensorOrList):
+        return self.to_nntr_weights(tensorOrList)
 
 ##
 # @brief A factory function to attach translayer to existing layer
@@ -226,4 +239,7 @@ def attach_trans_layer(layer):
     if isinstance(layer, CHANNEL_LAST_LAYERS):
         return ChannelLastTransLayer(layer)
 
+    if isinstance(layer, K.layers.GRU):
+        return GRUTransLayer(layer)
+
     return layer
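GRUTransLayer above exists because Keras stores the reset_after bias as a single tensor of shape (2, 3*units): row 0 is the input-to-hidden bias and row 1 is the recurrent bias. The translayer splits that rank-2 tensor into bias_ih and bias_hh and leaves the fused rank-1 bias of the non-reset_after case untouched, so the nntrainer side always receives weights in the order weight_ih, weight_hh, bias_ih[, bias_hh]. A small stand-alone sketch of the same reordering on plain arrays (names are illustrative):

    import numpy as np

    def keras_gru_weights_to_nntr(weights):
        """weights = [kernel, recurrent_kernel, bias] as from layer.get_weights()."""
        kernel, recurrent_kernel, bias = weights
        if bias.ndim == 2:               # reset_after=True: rows are (bias_ih, bias_hh)
            bias_ih, bias_hh = bias[0], bias[1]
            return [kernel, recurrent_kernel, bias_ih, bias_hh]
        return weights                   # reset_after=False: one fused bias

    # usage sketch, assuming TensorFlow is available:
    #   gru = tf.keras.layers.GRU(3, reset_after=True)
    #   gru.build((None, 2, 4))
    #   nntr_weights = keras_gru_weights_to_nntr(gru.get_weights())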
index 989b77a..9b7490c 100644 (file)
 #include <gru.h>
 #include <layers_common_tests.h>
 
-auto semantic_gru =
-  LayerSemanticsParamType(nntrainer::createLayer<nntrainer::GRULayer>,
-                          nntrainer::GRULayer::type, {"unit=1"}, 0, false, 1);
+auto semantic_gru = LayerSemanticsParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>, nntrainer::GRULayer::type,
+  {"unit=1", "integrate_bias=true", "reset_after=false"}, 0, false, 1);
 
 INSTANTIATE_TEST_CASE_P(GRU, LayerSemantics, ::testing::Values(semantic_gru));
 
 auto gru_single_step = LayerGoldenTestParamType(
-  nntrainer::createLayer<nntrainer::GRULayer>, {"unit=5"}, "3:1:1:7",
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "integrate_bias=true", "reset_after=false"}, "3:1:1:7",
   "gru_single_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
 
 auto gru_multi_step = LayerGoldenTestParamType(
-  nntrainer::createLayer<nntrainer::GRULayer>, {"unit=5"}, "3:1:4:7",
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "integrate_bias=true", "reset_after=false"}, "3:1:4:7",
   "gru_multi_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
 
-auto gru_single_step_seq = LayerGoldenTestParamType(
-  nntrainer::createLayer<nntrainer::GRULayer>,
-  {"unit=5", "return_sequences=true"}, "3:1:1:7",
-  "gru_single_step_seq.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto gru_single_step_seq =
+  LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRULayer>,
+                           {"unit=5", "return_sequences=true",
+                            "integrate_bias=true", "reset_after=false"},
+                           "3:1:1:7", "gru_single_step_seq.nnlayergolden",
+                           LayerGoldenTestParamOptions::DEFAULT);
 
-auto gru_multi_step_seq = LayerGoldenTestParamType(
-  nntrainer::createLayer<nntrainer::GRULayer>,
-  {"unit=5", "return_sequences=true"}, "3:1:4:7",
-  "gru_multi_step_seq.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto gru_multi_step_seq =
+  LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRULayer>,
+                           {"unit=5", "return_sequences=true",
+                            "integrate_bias=true", "reset_after=false"},
+                           "3:1:4:7", "gru_multi_step_seq.nnlayergolden",
+                           LayerGoldenTestParamOptions::DEFAULT);
 
 auto gru_multi_step_seq_act_orig = LayerGoldenTestParamType(
   nntrainer::createLayer<nntrainer::GRULayer>,
   {"unit=5", "return_sequences=true", "hidden_state_activation=tanh",
-   "recurrent_activation=sigmoid"},
+   "recurrent_activation=sigmoid", "integrate_bias=true", "reset_after=false"},
   "3:1:4:7", "gru_multi_step_seq.nnlayergolden",
   LayerGoldenTestParamOptions::DEFAULT);
 
 auto gru_multi_step_seq_act = LayerGoldenTestParamType(
   nntrainer::createLayer<nntrainer::GRULayer>,
   {"unit=5", "return_sequences=true", "hidden_state_activation=sigmoid",
-   "recurrent_activation=tanh"},
+   "recurrent_activation=tanh", "integrate_bias=true", "reset_after=false"},
   "3:1:4:7", "gru_multi_step_seq_act.nnlayergolden",
   LayerGoldenTestParamOptions::DEFAULT);
 
-INSTANTIATE_TEST_CASE_P(GRU, LayerGoldenTest,
-                        ::testing::Values(gru_single_step, gru_multi_step,
-                                          gru_single_step_seq,
-                                          gru_multi_step_seq,
-                                          gru_multi_step_seq_act_orig,
-                                          gru_multi_step_seq_act));
+// Check reset_after
+auto gru_reset_after_single_step = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "integrate_bias=false", "reset_after=true"}, "3:1:1:7",
+  "gru_reset_after_single_step.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "integrate_bias=false", "reset_after=true"}, "3:1:4:7",
+  "gru_reset_after_multi_step.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_single_step_seq = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "return_sequences=true", "integrate_bias=false",
+   "reset_after=true"},
+  "3:1:1:7", "gru_reset_after_single_step_seq.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "return_sequences=true", "integrate_bias=false",
+   "reset_after=true"},
+  "3:1:4:7", "gru_reset_after_multi_step_seq.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq_act_orig = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "return_sequences=true", "hidden_state_activation=tanh",
+   "recurrent_activation=sigmoid", "integrate_bias=false", "reset_after=true"},
+  "3:1:4:7", "gru_reset_after_multi_step_seq.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq_act = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::GRULayer>,
+  {"unit=5", "return_sequences=true", "hidden_state_activation=sigmoid",
+   "recurrent_activation=tanh", "integrate_bias=false", "reset_after=true"},
+  "3:1:4:7", "gru_reset_after_multi_step_seq_act.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+INSTANTIATE_TEST_CASE_P(
+  GRU, LayerGoldenTest,
+  ::testing::Values(gru_single_step, gru_multi_step, gru_single_step_seq,
+                    gru_multi_step_seq, gru_multi_step_seq_act_orig,
+                    gru_multi_step_seq_act, gru_reset_after_single_step,
+                    gru_reset_after_multi_step, gru_reset_after_single_step_seq,
+                    gru_reset_after_multi_step_seq,
+                    gru_reset_after_multi_step_seq_act_orig,
+                    gru_reset_after_multi_step_seq_act));
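Each parameter set above maps directly onto the Keras configuration used to produce its .nnlayergolden file: "unit=5" is the hidden size, an input of "3:1:4:7" reads as batch 3, 4 time steps, feature size 7, and the integrate_bias / reset_after pair selects the bias layout described earlier. The recording pipeline itself is not reproduced here, but the configuration behind, for example, gru_reset_after_multi_step_seq would look roughly like:

    import tensorflow as tf

    # illustrative only; shapes and flags mirror the test parameters above
    layer = tf.keras.layers.GRU(
        units=5,
        activation="tanh",
        recurrent_activation="sigmoid",
        return_sequences=True,
        reset_after=True,                 # separate bias_ih / bias_hh
    )
    x = tf.random.normal((3, 4, 7))       # batch=3, time=4, feature=7
    y = layer(x)                          # (3, 4, 5) with return_sequences=True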
index 92e75b3..b331d54 100644 (file)
 #include <grucell.h>
 #include <layers_common_tests.h>
 
-auto semantic_grucell = LayerSemanticsParamType(
-  nntrainer::createLayer<nntrainer::GRUCellLayer>,
-  nntrainer::GRUCellLayer::type,
-  {"unit=1", "max_timestep=1", "timestep=0", "integrate_bias=true"}, 0, false,
-  1);
+auto semantic_grucell =
+  LayerSemanticsParamType(nntrainer::createLayer<nntrainer::GRUCellLayer>,
+                          nntrainer::GRUCellLayer::type,
+                          {"unit=1", "max_timestep=1", "timestep=0",
+                           "integrate_bias=false", "reset_after=true"},
+                          0, false, 1);
 
 INSTANTIATE_TEST_CASE_P(GRUCell, LayerSemantics,
                         ::testing::Values(semantic_grucell));
 
-auto grucell_single_step = LayerGoldenTestParamType(
-  nntrainer::createLayer<nntrainer::GRUCellLayer>,
-  {"unit=5", "max_timestep=1", "timestep=0", "integrate_bias=true"}, "3:1:1:7",
-  "gru_single_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto grucell_single_step =
+  LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRUCellLayer>,
+                           {"unit=5", "max_timestep=1", "timestep=0",
+                            "integrate_bias=true", "reset_after=false"},
+                           "3:1:1:7", "gru_single_step.nnlayergolden",
+                           LayerGoldenTestParamOptions::DEFAULT);
 
 INSTANTIATE_TEST_CASE_P(GRUCell, LayerGoldenTest,
                         ::testing::Values(grucell_single_step));
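grucell_single_step reuses gru_single_step.nnlayergolden: with max_timestep=1 a GRU cell evaluated once from a zero state is numerically the same as a GRU layer run over a length-1 sequence, so one golden file serves both (here with integrate_bias=true and reset_after=false). The equivalence is easy to confirm in Keras, e.g.:

    import numpy as np
    import tensorflow as tf

    units, feature = 5, 7
    x = tf.random.normal((3, 1, feature))         # batch=3, a single timestep

    gru = tf.keras.layers.GRU(units, reset_after=False)
    out_layer = gru(x)                            # GRU over a length-1 sequence

    cell = tf.keras.layers.GRUCell(units, reset_after=False)
    cell.build((3, feature))
    cell.set_weights(gru.get_weights())           # reuse the layer's parameters
    out_cell, _ = cell(x[:, 0, :], [tf.zeros((3, units))])

    print(np.allclose(out_layer.numpy(), out_cell.numpy(), atol=1e-6))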
index 4dd0365..3a10fa8 100644 (file)
@@ -406,7 +406,8 @@ static std::unique_ptr<NeuralNetwork> makeSingleGRUCell() {
   }
 
   auto grucell = makeGraph({
-    {"grucell", {"name=a1", "unit=2", "integrate_bias=false"}},
+    {"grucell",
+     {"name=a1", "unit=2", "integrate_bias=false", "reset_after=true"}},
   });
 
   nn->addWithReferenceLayers(grucell, "grucell_scope", {"input"}, {"a1"},
@@ -436,9 +437,11 @@ static std::unique_ptr<NeuralNetwork> makeStackedGRUCell() {
   }
 
   auto grucell = makeGraph({
-    {"grucell", {"name=a1", "unit=2", "integrate_bias=false"}},
     {"grucell",
-     {"name=a2", "unit=2", "integrate_bias=false", "input_layers=a1"}},
+     {"name=a1", "unit=2", "integrate_bias=false", "reset_after=true"}},
+    {"grucell",
+     {"name=a2", "unit=2", "integrate_bias=false", "reset_after=true",
+      "input_layers=a1"}},
   });
 
   nn->addWithReferenceLayers(grucell, "grucell_scope", {"input"}, {"a1"},
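Both recurrent-model graphs now fix integrate_bias=false together with reset_after=true, so the model-level tests exercise the post-matmul reset path with the split bias_ih / bias_hh weights. A rough Keras analogue of the stacked a1 -> a2 graph, purely for orientation (the input shape and the surrounding input and loss layers sit outside the hunks shown and are assumptions):

    import tensorflow as tf

    # two stacked unit=2 GRUs with reset_after=True, mirroring a1 -> a2 above
    inputs = tf.keras.Input(shape=(2, 2))          # (timesteps, feature) assumed
    h1 = tf.keras.layers.GRU(2, reset_after=True, return_sequences=True)(inputs)
    h2 = tf.keras.layers.GRU(2, reset_after=True)(h1)
    model = tf.keras.Model(inputs, h2)
    model.summary()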
index 5b59183..c765842 100644 (file)
@@ -669,9 +669,9 @@ INI gru_basic(
   {
     nn_base + "loss=mse | batch_size=1",
     sgd_base + "learning_rate = 0.1",
-    I("input") + input_base + "input_shape=1:1:1",
+    I("input") + input_base + "input_shape=1:1:4",
     I("gru") + gru_base +
-      "unit = 1" + "input_layers=input",
+      "unit = 3" + "input_layers=input" + "integrate_bias=true" + "reset_after=false",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
   }
 );
@@ -681,9 +681,9 @@ INI gru_return_sequence(
   {
     nn_base + "loss=mse | batch_size=1",
     sgd_base + "learning_rate = 0.1",
-    I("input") + input_base + "input_shape=1:2:1",
+    I("input") + input_base + "input_shape=1:2:4",
     I("gru") + gru_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
   }
 );
@@ -693,9 +693,9 @@ INI gru_return_sequence_with_batch(
   {
     nn_base + "loss=mse | batch_size=2",
     sgd_base + "learning_rate = 0.1",
-    I("input") + input_base + "input_shape=1:2:1",
+    I("input") + input_base + "input_shape=1:2:4",
     I("gru") + gru_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
   }
 );
@@ -705,11 +705,11 @@ INI multi_gru_return_sequence(
   {
     nn_base + "loss=mse | batch_size=1",
     sgd_base + "learning_rate = 0.1",
-    I("input") + input_base + "input_shape=1:2:1",
+    I("input") + input_base + "input_shape=1:2:4",
     I("gru") + gru_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
     I("gru2") + gru_base +
-      "unit = 2" + "input_layers=gru",
+      "unit = 3" + "input_layers=gru" + "integrate_bias=true" + "reset_after=false",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
   }
 );
@@ -719,11 +719,76 @@ INI multi_gru_return_sequence_with_batch(
   {
     nn_base + "loss=mse | batch_size=2",
     sgd_base + "learning_rate = 0.1",
-    I("input") + input_base + "input_shape=1:2:1",
+    I("input") + input_base + "input_shape=1:2:4",
+    I("gru") + gru_base +
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
+    I("gru2") + gru_base +
+      "unit = 3" + "input_layers=gru" + "integrate_bias=true" + "reset_after=false",
+    I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
+  }
+);
+
+// Check reset_after
+INI gru_reset_after_basic(
+  "gru_reset_after_basic",
+  {
+    nn_base + "loss=mse | batch_size=1",
+    sgd_base + "learning_rate = 0.1",
+    I("input") + input_base + "input_shape=1:1:4",
+    I("gru") + gru_base +
+      "unit = 3" + "input_layers=input" + "integrate_bias=false" + "reset_after=true",
+    I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+  }
+);
+
+INI gru_reset_after_return_sequence(
+  "gru_reset_after_return_sequence",
+  {
+    nn_base + "loss=mse | batch_size=1",
+    sgd_base + "learning_rate = 0.1",
+    I("input") + input_base + "input_shape=1:2:4",
+    I("gru") + gru_base +
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+    I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+  }
+);
+
+INI gru_reset_after_return_sequence_with_batch(
+  "gru_reset_after_return_sequence_with_batch",
+  {
+    nn_base + "loss=mse | batch_size=2",
+    sgd_base + "learning_rate = 0.1",
+    I("input") + input_base + "input_shape=1:2:4",
+    I("gru") + gru_base +
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+    I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+  }
+);
+
+INI multi_gru_reset_after_return_sequence(
+  "multi_gru_reset_after_return_sequence",
+  {
+    nn_base + "loss=mse | batch_size=1",
+    sgd_base + "learning_rate = 0.1",
+    I("input") + input_base + "input_shape=1:2:4",
+    I("gru") + gru_base +
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+    I("gru2") + gru_base +
+      "unit = 3" + "input_layers=gru" + "integrate_bias=false" + "reset_after=true",
+    I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
+  }
+);
+
+INI multi_gru_reset_after_return_sequence_with_batch(
+  "multi_gru_reset_after_return_sequence_with_batch",
+  {
+    nn_base + "loss=mse | batch_size=2",
+    sgd_base + "learning_rate = 0.1",
+    I("input") + input_base + "input_shape=1:2:4",
     I("gru") + gru_base +
-      "unit = 2" + "input_layers=input"+ "return_sequences=true",
+      "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
     I("gru2") + gru_base +
-      "unit = 2" + "input_layers=gru",
+      "unit = 3" + "input_layers=gru" + "integrate_bias=false" + "reset_after=true",
     I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
   }
 );
@@ -928,6 +993,11 @@ INSTANTIATE_TEST_CASE_P(
       mkModelIniTc(gru_return_sequence_with_batch, "2:1:2:1", 10, ModelTestOption::ALL),
       mkModelIniTc(multi_gru_return_sequence, "1:1:1:1", 10, ModelTestOption::ALL),
       mkModelIniTc(multi_gru_return_sequence_with_batch, "2:1:1:1", 10, ModelTestOption::ALL),
+      mkModelIniTc(gru_reset_after_basic, "1:1:1:1", 10, ModelTestOption::ALL),
+      mkModelIniTc(gru_reset_after_return_sequence, "1:1:2:1", 10, ModelTestOption::ALL),
+      mkModelIniTc(gru_reset_after_return_sequence_with_batch, "2:1:2:1", 10, ModelTestOption::ALL),
+      mkModelIniTc(multi_gru_reset_after_return_sequence, "1:1:1:1", 10, ModelTestOption::ALL),
+      mkModelIniTc(multi_gru_reset_after_return_sequence_with_batch, "2:1:1:1", 10, ModelTestOption::ALL),
 
       /**< multi output test */
       mkModelIniTc(multiple_output_model, "3:1:1:10", 10, ModelTestOption::COMPARE) // Todo: Enable option to ALL
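Each gru_reset_after_* INI registered above is trained for 10 iterations and compared against golden data produced from an equivalent Keras model by the generators earlier in this patch. As a rough reference, gru_reset_after_basic corresponds to something like the following (reading the 1:1:4 input shape as one timestep with feature size 4 is an interpretation of the INI, not taken from it verbatim):

    import tensorflow as tf

    # loss=mse, batch_size=1, GRU unit=3 with reset_after, Dense(1) head, SGD(0.1)
    inputs = tf.keras.Input(shape=(1, 4), batch_size=1)      # time=1, feature=4
    x = tf.keras.layers.GRU(3, activation="tanh", recurrent_activation="sigmoid",
                            reset_after=True)(inputs)
    outputs = tf.keras.layers.Dense(1)(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1), loss="mse")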