- Enable bias_hh in gru.
- Enable reset_after in gru, grucell. If reset_after is set to true, the
reset gate is applied after the matrix multiplication; otherwise it is
applied before (see the sketch below).
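
A sketch of the candidate-gate math as implemented by the forwarding code
in this patch (z, r, g: update, reset, candidate gates; b_ih_g / b_hh_g:
the g-gate slices of bias_ih / bias_hh; with integrate_bias, a single
bias_h_g takes their place):
  reset_after == false: g = tanh(x.W_ig + (r * h_prev).W_hg + b_hh_g + b_ih_g)
  reset_after == true : g = tanh(x.W_ig + r * (h_prev.W_hg + b_hh_g) + b_ih_g)
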
close #1768
Self evaluation:
Build test: [X]Passed [ ]Failed [ ]Skipped
Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: hyeonseok lee <hs89.lee@samsung.com>
};
/**
+ * @brief ResetAfter property. If true, the reset gate is applied after the
+ * matrix multiplication; if false, before it. Used in gru and grucell.
+ *
+ */
+class ResetAfter : public nntrainer::Property<bool> {
+
+public:
+ /**
+ * @brief Construct a new ResetAfter object with a default value true
+ *
+ */
+ ResetAfter(bool value = true) : nntrainer::Property<bool>(value) {}
+ static constexpr const char *key = "reset_after"; /**< unique key to access */
+ using prop_tag = bool_prop_tag; /**< property type */
+};
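+
+// Usage (a sketch, via the layer's key=value property interface):
+//   layer->setProperty({"reset_after=true"});  // Keras/cuDNN-style GRU
+//   layer->setProperty({"reset_after=false"}); // classic pre-multiplication GRU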
+
+/**
* @brief Number of class
* @todo deprecate this
*/
static constexpr size_t SINGLE_INOUT_IDX = 0;
enum GRUParams {
- weight_xh,
+ weight_ih,
weight_hh,
bias_h,
+ bias_ih,
+ bias_hh,
hidden_state,
zrg,
h_prev,
GRULayer::GRULayer() :
LayerImpl(),
- gru_props(props::Unit(), props::HiddenStateActivation(),
- props::RecurrentActivation(), props::ReturnSequences(),
- props::DropOutRate()),
+ gru_props(props::Unit(),
+ props::HiddenStateActivation() = ActivationType::ACT_TANH,
+ props::RecurrentActivation() = ActivationType::ACT_SIGMOID,
+ props::ReturnSequences(), props::DropOutRate(),
+ props::IntegrateBias(), props::ResetAfter()),
acti_func(ActivationType::ACT_NONE, true),
recurrent_acti_func(ActivationType::ACT_NONE, true),
epsilon(1e-3) {
wt_idx.fill(std::numeric_limits<unsigned>::max());
}
-// - weight_xh ( input to hidden )
-// : [1, 1, input_size, unit (hidden_size) x NUM_GATE] -> z, r, g
-// - weight_hh ( hidden to hidden )
-// : [1, 1, unit (hidden_size) , unit (hidden_size) x NUM_GATE] -> z, r, g
-// - bias_h ( hidden bias )
-// : [1, 1, 1, unit (hidden_size) x NUM_GATE] -> z, r, g
void GRULayer::finalize(InitLayerContext &context) {
- auto &weight_regularizer =
- std::get<props::WeightRegularizer>(*layer_impl_props);
- auto &weight_regularizer_constant =
- std::get<props::WeightRegularizerConstant>(*layer_impl_props);
- auto &weight_initializer =
- std::get<props::WeightInitializer>(*layer_impl_props);
- auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
-
- auto unit = std::get<props::Unit>(gru_props).get();
- auto &hidden_state_activation_type =
- std::get<props::HiddenStateActivation>(gru_props);
- auto &recurrent_activation_type =
- std::get<props::RecurrentActivation>(gru_props);
- bool return_sequences = std::get<props::ReturnSequences>(gru_props);
- float dropout_rate = std::get<props::DropOutRate>(gru_props);
+ const Tensor::Initializer weight_initializer =
+ std::get<props::WeightInitializer>(*layer_impl_props).get();
+ const Tensor::Initializer bias_initializer =
+ std::get<props::BiasInitializer>(*layer_impl_props).get();
+ const WeightRegularizer weight_regularizer =
+ std::get<props::WeightRegularizer>(*layer_impl_props).get();
+ const float weight_regularizer_constant =
+ std::get<props::WeightRegularizerConstant>(*layer_impl_props).get();
+ const bool disable_bias =
+ std::get<props::DisableBias>(*layer_impl_props).get();
+
+ const unsigned int unit = std::get<props::Unit>(gru_props).get();
+ ActivationType hidden_state_activation_type =
+ std::get<props::HiddenStateActivation>(gru_props).get();
+ ActivationType recurrent_activation_type =
+ std::get<props::RecurrentActivation>(gru_props).get();
+ const bool return_sequences =
+ std::get<props::ReturnSequences>(gru_props).get();
+ const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+ const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
if (context.getNumInputs() != 1) {
throw std::invalid_argument("GRU layer takes only one input");
}
- TensorDim output_dim;
+ // input_dim = [ batch, 1, time_iteration, feature_size ]
const TensorDim &input_dim = context.getInputDimensions()[0];
+ const unsigned int batch_size = input_dim.batch();
+ const unsigned int max_timestep = input_dim.height();
+ const unsigned int feature_size = input_dim.width();
- // input_dim = [ batch, 1, time_iteration, feature_size ]
// if return_sequences == False :
- // output_dim = [ batch, 1, 1, hidden_size (unit)]
+ // output_dim = [ batch, 1, 1, unit ]
// else:
- // output_dim = [ batch, 1, time_iteration, hidden_size ( unit ) ]
- output_dim = input_dim;
- output_dim.width(unit);
-
- if (dropout_rate > epsilon) {
- wt_idx[GRUParams::dropout_mask] = context.requestTensor(
- output_dim, "dropout_mask", Tensor::Initializer::NONE, false,
- TensorLifespan::ITERATION_LIFESPAN);
- }
-
- if (!return_sequences) {
- output_dim.height(1);
- }
-
+ // output_dim = [ batch, 1, time_iteration, unit ]
+ TensorDim output_dim(
+ {batch_size, 1, return_sequences ? max_timestep : 1, unit});
context.setOutputDimensions({output_dim});
- TensorDim bias_dim = TensorDim();
- bias_dim.setTensorDim(3, unit * NUM_GATE);
-
- TensorDim dim_xh = output_dim;
- dim_xh.height(input_dim.width());
- dim_xh.width(unit * NUM_GATE);
- dim_xh.batch(1);
-
- TensorDim dim_hh = output_dim;
- dim_hh.height(unit);
- dim_hh.width(unit * NUM_GATE);
- dim_hh.batch(1);
-
- // weight_initializer can be set seperately. weight_xh initializer,
+ // weight_initializer can be set separately: weight_ih initializer and
// weight_hh initializer (kernel_initializer & recurrent_initializer in
// keras). For now, both are set the same way.
- wt_idx[GRUParams::weight_xh] =
- context.requestWeight(dim_xh, weight_initializer, weight_regularizer,
- weight_regularizer_constant, "weight_xh", true);
+
+ // - weight_ih ( input to hidden )
+ // weight_ih_dim : [ 1, 1, feature_size, NUM_GATE * unit ] -> z, r, g
+ TensorDim weight_ih_dim({feature_size, NUM_GATE * unit});
+ wt_idx[GRUParams::weight_ih] =
+ context.requestWeight(weight_ih_dim, weight_initializer, weight_regularizer,
+ weight_regularizer_constant, "weight_ih", true);
+ // - weight_hh ( hidden to hidden )
+ // weight_hh_dim : [ 1, 1, unit, NUM_GATE * unit ] -> z, r, g
+ TensorDim weight_hh_dim({unit, NUM_GATE * unit});
wt_idx[GRUParams::weight_hh] =
- context.requestWeight(dim_hh, weight_initializer, weight_regularizer,
+ context.requestWeight(weight_hh_dim, weight_initializer, weight_regularizer,
weight_regularizer_constant, "weight_hh", true);
- wt_idx[GRUParams::bias_h] = context.requestWeight(
- bias_dim, bias_initializer, WeightRegularizer::NONE, 1.0f, "bias_h", true);
-
- TensorDim d = input_dim;
- d.width(unit);
+ if (!disable_bias) {
+ if (integrate_bias) {
+ // - bias_h ( input bias and hidden bias integrated into one bias )
+ // bias_h_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+ TensorDim bias_h_dim({NUM_GATE * unit});
+ wt_idx[GRUParams::bias_h] =
+ context.requestWeight(bias_h_dim, bias_initializer,
+ WeightRegularizer::NONE, 1.0f, "bias_h", true);
+ } else {
+ // - bias_ih ( input bias )
+ // bias_ih_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+ TensorDim bias_ih_dim({NUM_GATE * unit});
+ wt_idx[GRUParams::bias_ih] =
+ context.requestWeight(bias_ih_dim, bias_initializer,
+ WeightRegularizer::NONE, 1.0f, "bias_ih", true);
+ // - bias_hh ( hidden bias )
+ // bias_hh_dim : [ 1, 1, 1, NUM_GATE * unit ] -> z, r, g
+ TensorDim bias_hh_dim({NUM_GATE * unit});
+ wt_idx[GRUParams::bias_hh] =
+ context.requestWeight(bias_hh_dim, bias_initializer,
+ WeightRegularizer::NONE, 1.0f, "bias_hh", true);
+ }
+ }
- wt_idx[GRUParams::hidden_state] =
- context.requestTensor(d, "hidden_state", Tensor::Initializer::NONE, true,
- TensorLifespan::ITERATION_LIFESPAN);
+ // hidden_state_dim = [ batch, 1, max_timestep, unit ]
+ TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit);
+ wt_idx[GRUParams::hidden_state] = context.requestTensor(
+ hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true,
+ TensorLifespan::ITERATION_LIFESPAN);
- d.width(unit * NUM_GATE);
+ // zrg_dim = [ batch, 1, max_timestep, NUM_GATE * unit ]
+ TensorDim zrg_dim(batch_size, 1, max_timestep, NUM_GATE * unit);
wt_idx[GRUParams::zrg] =
- context.requestTensor(d, "zrg", Tensor::Initializer::NONE, true,
+ context.requestTensor(zrg_dim, "zrg", Tensor::Initializer::NONE, true,
TensorLifespan::ITERATION_LIFESPAN);
- TensorDim h_dim = TensorDim();
- h_dim.setTensorDim(3, unit);
- h_dim.batch(input_dim.batch());
+ // h_prev_dim = [ batch, 1, 1, unit ]
+ TensorDim h_prev_dim({batch_size, 1, 1, unit});
wt_idx[GRUParams::h_prev] =
- context.requestTensor(h_dim, "h_prev", Tensor::Initializer::NONE, false,
- TensorLifespan::FORWARD_FUNC_LIFESPAN);
+ context.requestTensor(h_prev_dim, "h_prev", Tensor::Initializer::NONE,
+ false, TensorLifespan::FORWARD_FUNC_LIFESPAN);
- if (hidden_state_activation_type.get() == ActivationType::ACT_NONE) {
- hidden_state_activation_type.set(ActivationType::ACT_TANH);
+ if (dropout_rate > epsilon) {
+ TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit);
+ wt_idx[GRUParams::dropout_mask] = context.requestTensor(
+ output_dim, "dropout_mask", Tensor::Initializer::NONE, false,
+ TensorLifespan::ITERATION_LIFESPAN);
}
- acti_func.setActiFunc(hidden_state_activation_type.get());
- if (recurrent_activation_type.get() == ActivationType::ACT_NONE) {
- recurrent_activation_type.set(ActivationType::ACT_SIGMOID);
- }
- recurrent_acti_func.setActiFunc(recurrent_activation_type.get());
+ acti_func.setActiFunc(hidden_state_activation_type);
+ recurrent_acti_func.setActiFunc(recurrent_activation_type);
}
void GRULayer::setProperty(const std::vector<std::string> &values) {
}
void GRULayer::forwarding(RunLayerContext &context, bool training) {
- auto unit = std::get<props::Unit>(gru_props).get();
- bool return_sequences = std::get<props::ReturnSequences>(gru_props);
- float dropout_rate = std::get<props::DropOutRate>(gru_props);
-
- Tensor &weight_xh = context.getWeight(wt_idx[GRUParams::weight_xh]);
- Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
- Tensor &bias_h = context.getWeight(wt_idx[GRUParams::bias_h]);
+ const bool disable_bias =
+ std::get<props::DisableBias>(*layer_impl_props).get();
+
+ const unsigned int unit = std::get<props::Unit>(gru_props).get();
+ const bool return_sequences =
+ std::get<props::ReturnSequences>(gru_props).get();
+ const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+ const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
+ const bool reset_after = std::get<props::ResetAfter>(gru_props).get();
+
+ Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+ const TensorDim &input_dim = input.getDim();
+ const unsigned int batch_size = input_dim.batch();
+ const unsigned int max_timestep = input_dim.height();
+ const unsigned int feature_size = input_dim.width();
+ Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
- Tensor &hidden_ = context.getTensor(wt_idx[GRUParams::hidden_state]);
- Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+ const Tensor &weight_ih = context.getWeight(wt_idx[GRUParams::weight_ih]);
+ const Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
+ Tensor empty;
+ Tensor &bias_h = !disable_bias && integrate_bias
+ ? context.getWeight(wt_idx[GRUParams::bias_h])
+ : empty;
+ Tensor &bias_ih = !disable_bias && !integrate_bias
+ ? context.getWeight(wt_idx[GRUParams::bias_ih])
+ : empty;
+ Tensor &bias_hh = !disable_bias && !integrate_bias
+ ? context.getWeight(wt_idx[GRUParams::bias_hh])
+ : empty;
+
+ Tensor &hidden_state = context.getTensor(wt_idx[GRUParams::hidden_state]);
Tensor &zrg = context.getTensor(wt_idx[GRUParams::zrg]);
Tensor &h_prev = context.getTensor(wt_idx[GRUParams::h_prev]);
- const TensorDim &input_dim = input_.getDim();
- hidden_.setZero();
+ hidden_state.setZero();
zrg.setZero();
h_prev.setZero();
- Tensor hs_prev;
+ Tensor prev_hs;
Tensor hs;
// zt = sigma(W_hz.h_prev + W_xz.xs)
// rt = sigma(W_hr.h_prev + W_xr.xs)
// gt = tanh((h_prev*rt).W_hg + W_xg.xs)
// h_nx = (1-zt)*gt + zt*h_prev
- for (unsigned int b = 0; b < input_dim.batch(); ++b) {
- Tensor islice = input_.getBatchSlice(b, 1);
- Tensor oslice = hidden_.getBatchSlice(b, 1);
+ for (unsigned int b = 0; b < batch_size; ++b) {
+ Tensor islice = input.getBatchSlice(b, 1);
+ Tensor oslice = hidden_state.getBatchSlice(b, 1);
Tensor zrg_ = zrg.getBatchSlice(b, 1);
- for (unsigned int t = 0; t < islice.height(); ++t) {
- Tensor xs =
- islice.getSharedDataTensor({islice.width()}, t * islice.width());
+ for (unsigned int t = 0; t < max_timestep; ++t) {
+ Tensor xs = islice.getSharedDataTensor({feature_size}, t * feature_size);
/** @todo verify that this dropout works */
// if (dropout_rate > 0.0 && training) {
// xs.multiply_i(xs.dropout_mask(dropout_rate));
// }
- hs = oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
+ hs = oslice.getSharedDataTensor({unit}, t * unit);
Tensor zrg_t =
zrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
if (t > 0) {
- hs_prev = oslice.getSharedDataTensor({oslice.width()},
- (t - 1) * oslice.width());
+ prev_hs = oslice.getSharedDataTensor({unit}, (t - 1) * unit);
} else {
- hs_prev = h_prev.getBatchSlice(b, 1);
+ prev_hs = h_prev.getBatchSlice(b, 1);
}
- xs.dot(weight_xh, zrg_t); // x_z, x_r, x_g
+ xs.dot(weight_ih, zrg_t); // x_z, x_r, x_g
Tensor ztrt = zrg_t.getSharedDataTensor({unit * 2}, 0);
- Tensor ztrt_b = bias_h.getSharedDataTensor({unit * 2}, 0);
Tensor w_hh;
w_hh.copy_with_stride(
weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false));
Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2);
- Tensor gt_b = bias_h.getSharedDataTensor({unit}, unit * 2);
- ztrt.add_i(hs_prev.dot(w_hh));
- ztrt.add_i(ztrt_b);
+ ztrt.add_i(prev_hs.dot(w_hh));
+ if (!disable_bias) {
+ if (integrate_bias) {
+ Tensor ztrt_bias_h = bias_h.getSharedDataTensor({unit * 2}, 0);
+ ztrt.add_i(ztrt_bias_h);
+ } else {
+ Tensor ztrt_bias_ih = bias_ih.getSharedDataTensor({unit * 2}, 0);
+ ztrt.add_i(ztrt_bias_ih);
+ Tensor ztrt_bias_hh = bias_hh.getSharedDataTensor({unit * 2}, 0);
+ ztrt.add_i(ztrt_bias_hh);
+ }
+ }
+
+ recurrent_acti_func.run_fn(ztrt, ztrt);
Tensor zt = ztrt.getSharedDataTensor({unit}, 0);
Tensor rt = ztrt.getSharedDataTensor({unit}, unit);
- recurrent_acti_func.run_fn(rt, rt);
- recurrent_acti_func.run_fn(zt, zt);
-
Tensor temp;
- rt.multiply(hs_prev, temp);
- gt.add_i(temp.dot(w_g));
- gt.add_i(gt_b);
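+ // reset_after == true : g += r * (h_prev . W_hg + b_hh_g)   (Keras/cuDNN)
+ // reset_after == false: g += (r * h_prev) . W_hg, with b_hh_g added after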
+ if (reset_after) {
+ prev_hs.dot(w_g, temp);
+ if (!disable_bias && !integrate_bias) {
+ Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ temp.add_i(bias_hh_g);
+ }
+ temp.multiply_i(rt);
+ gt.add_i(temp);
+ } else {
+ rt.multiply(prev_hs, temp);
+ temp.dot(w_g, gt, false, false, 1.0f);
+ if (!disable_bias && !integrate_bias) {
+ Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ gt.add_i(bias_hh_g);
+ }
+ }
+ if (!disable_bias) {
+ if (integrate_bias) {
+ Tensor gt_bias_h = bias_h.getSharedDataTensor({unit}, unit * 2);
+ gt.add_i(gt_bias_h);
+ } else {
+ Tensor gt_bias_ih = bias_ih.getSharedDataTensor({unit}, unit * 2);
+ gt.add_i(gt_bias_ih);
+ }
+ }
+
acti_func.run_fn(gt, gt);
- zt.multiply(hs_prev, hs);
+ zt.multiply(prev_hs, hs);
temp = zt.multiply(-1.0).add(1.0);
hs.add_i(gt.multiply(temp));
if (dropout_rate > epsilon && training) {
Tensor mask_ = context.getTensor(wt_idx[GRUParams::dropout_mask])
.getBatchSlice(b, 1);
- Tensor msk =
- mask_.getSharedDataTensor({mask_.width()}, t * mask_.width());
+ Tensor msk = mask_.getSharedDataTensor({unit}, t * unit);
msk.dropout_mask(dropout_rate);
hs.multiply_i(msk);
}
}
}
- Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
if (!return_sequences) {
- TensorDim d = hidden_.getDim();
- for (unsigned int b = 0; b < input_dim.batch(); ++b) {
- Tensor dest = output.getSharedDataTensor({d.width()}, b * d.width());
- Tensor src = hidden_.getSharedDataTensor(
- {d.width()}, b * d.width() * d.height() + (d.height() - 1) * d.width());
+ for (unsigned int batch = 0; batch < batch_size; ++batch) {
+ Tensor dest = output.getSharedDataTensor({unit}, batch * unit);
+ Tensor src = hidden_state.getSharedDataTensor(
+ {unit}, batch * unit * max_timestep + (max_timestep - 1) * unit);
dest.copy(src);
}
} else {
- output.copy(hidden_);
+ output.copy(hidden_state);
}
}
void GRULayer::calcDerivative(RunLayerContext &context) {
- Tensor &derivative_ = context.getTensorGrad(wt_idx[GRUParams::zrg]);
- Tensor &weight = context.getWeight(wt_idx[GRUParams::weight_xh]);
- Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
+ Tensor &zrg_derivative = context.getTensorGrad(wt_idx[GRUParams::zrg]);
+ Tensor &weight_ih = context.getWeight(wt_idx[GRUParams::weight_ih]);
+ Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
- derivative_.dot(weight, ret_, false, true);
+ zrg_derivative.dot(weight_ih, outgoing_derivative, false, true);
}
void GRULayer::calcGradient(RunLayerContext &context) {
- auto unit = std::get<props::Unit>(gru_props).get();
- bool return_sequences = std::get<props::ReturnSequences>(gru_props);
- float dropout_rate = std::get<props::DropOutRate>(gru_props);
-
- Tensor &djdw_x = context.getWeightGrad(wt_idx[GRUParams::weight_xh]);
- Tensor &djdw_h = context.getWeightGrad(wt_idx[GRUParams::weight_hh]);
- Tensor &djdb_h = context.getWeightGrad(wt_idx[GRUParams::bias_h]);
+ const bool disable_bias =
+ std::get<props::DisableBias>(*layer_impl_props).get();
+
+ const unsigned int unit = std::get<props::Unit>(gru_props).get();
+ const bool return_sequences =
+ std::get<props::ReturnSequences>(gru_props).get();
+ const float dropout_rate = std::get<props::DropOutRate>(gru_props).get();
+ const bool integrate_bias = std::get<props::IntegrateBias>(gru_props).get();
+ const bool reset_after = std::get<props::ResetAfter>(gru_props).get();
+
+ Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+ const TensorDim &input_dim = input.getDim();
+ const unsigned int batch_size = input_dim.batch();
+ const unsigned int max_timestep = input_dim.height();
+ const unsigned int feature_size = input_dim.width();
+ Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
+
+ Tensor &djdweight_ih = context.getWeightGrad(wt_idx[GRUParams::weight_ih]);
Tensor &weight_hh = context.getWeight(wt_idx[GRUParams::weight_hh]);
-
- Tensor djdw_zr_h = Tensor({1, 1, unit, unit * 2}, true);
- Tensor djdw_g_h = Tensor({1, 1, unit, unit}, true);
- Tensor &derivative_ = context.getTensorGrad(wt_idx[GRUParams::hidden_state]);
- Tensor &hidden_ = context.getTensor(wt_idx[GRUParams::hidden_state]);
- Tensor &incoming_deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
- Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+ Tensor &djdweight_hh = context.getWeightGrad(wt_idx[GRUParams::weight_hh]);
+ Tensor empty;
+ Tensor &djdbias_h = !disable_bias && integrate_bias
+ ? context.getWeightGrad(wt_idx[GRUParams::bias_h])
+ : empty;
+ Tensor &djdbias_ih = !disable_bias && !integrate_bias
+ ? context.getWeightGrad(wt_idx[GRUParams::bias_ih])
+ : empty;
+ Tensor &bias_hh = !disable_bias && !integrate_bias
+ ? context.getWeight(wt_idx[GRUParams::bias_hh])
+ : empty;
+ Tensor &djdbias_hh = !disable_bias && !integrate_bias
+ ? context.getWeightGrad(wt_idx[GRUParams::bias_hh])
+ : empty;
+
+ Tensor djdweight_hh_zr = Tensor({1, 1, unit, unit * 2}, true);
+ Tensor djdweight_hh_g = Tensor({1, 1, unit, unit}, true);
+ Tensor &hidden_state_derivative =
+ context.getTensorGrad(wt_idx[GRUParams::hidden_state]);
+ Tensor &hidden_state = context.getTensor(wt_idx[GRUParams::hidden_state]);
Tensor &zrg = context.getTensor(wt_idx[GRUParams::zrg]);
Tensor &d_zrg = context.getTensorGrad(wt_idx[GRUParams::zrg]);
- const TensorDim &input_dim = input_.getDim();
- djdw_x.setZero();
- djdw_zr_h.setZero();
- djdw_g_h.setZero();
- djdb_h.setZero();
+ djdweight_ih.setZero();
+ djdweight_hh_zr.setZero();
+ djdweight_hh_g.setZero();
+ if (!disable_bias) {
+ if (integrate_bias) {
+ djdbias_h.setZero();
+ } else {
+ djdbias_ih.setZero();
+ djdbias_hh.setZero();
+ }
+ }
- derivative_.setZero();
+ hidden_state_derivative.setZero();
d_zrg.setZero();
if (!return_sequences) {
- TensorDim d = derivative_.getDim();
- for (unsigned int b = 0; b < input_dim.batch(); ++b) {
- Tensor dest = derivative_.getSharedDataTensor(
- {d.width()}, b * d.width() * d.height() + (d.height() - 1) * d.width());
+ for (unsigned int batch = 0; batch < batch_size; ++batch) {
+ Tensor dest = hidden_state_derivative.getSharedDataTensor(
+ {unit}, batch * unit * max_timestep + (max_timestep - 1) * unit);
Tensor src =
- incoming_deriv.getSharedDataTensor({d.width()}, b * d.width());
+ incoming_derivative.getSharedDataTensor({unit}, batch * unit);
dest.copy(src);
}
} else {
- derivative_.copy(incoming_deriv);
+ hidden_state_derivative.copy(incoming_derivative);
}
if (dropout_rate > epsilon) {
- derivative_.multiply_i(context.getTensor(wt_idx[GRUParams::dropout_mask]));
+ hidden_state_derivative.multiply_i(
+ context.getTensor(wt_idx[GRUParams::dropout_mask]));
}
- Tensor dh_nx = Tensor({derivative_.width()});
+ Tensor dh_nx = Tensor({unit});
- for (unsigned int b = 0; b < input_dim.batch(); ++b) {
- Tensor deriv_t = derivative_.getBatchSlice(b, 1);
- Tensor xs_t = input_.getBatchSlice(b, 1);
- Tensor hs_t = hidden_.getBatchSlice(b, 1);
+ for (unsigned int b = 0; b < batch_size; ++b) {
+ Tensor deriv_t = hidden_state_derivative.getBatchSlice(b, 1);
+ Tensor xs_t = input.getBatchSlice(b, 1);
+ Tensor hs_t = hidden_state.getBatchSlice(b, 1);
dh_nx.setZero();
Tensor dh;
- Tensor hs_prev;
+ Tensor prev_hs;
Tensor xs;
Tensor dzrg_ = d_zrg.getBatchSlice(b, 1);
Tensor zrg_ = zrg.getBatchSlice(b, 1);
- for (unsigned int t = deriv_t.height(); t-- > 0;) {
- dh = deriv_t.getSharedDataTensor({deriv_t.width()}, t * deriv_t.width());
- xs = xs_t.getSharedDataTensor({xs_t.width()}, t * xs_t.width());
+ for (unsigned int t = max_timestep; t-- > 0;) {
+ dh = deriv_t.getSharedDataTensor({unit}, t * unit);
+ xs = xs_t.getSharedDataTensor({feature_size}, t * feature_size);
Tensor dzrg_t =
dzrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
Tensor zrg_t =
zrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
if (t == 0) {
- hs_prev = Tensor({hs_t.width()});
- hs_prev.setZero();
+ prev_hs = Tensor({unit});
+ prev_hs.setZero();
} else {
- hs_prev =
- hs_t.getSharedDataTensor({hs_t.width()}, (t - 1) * hs_t.width());
+ prev_hs = hs_t.getSharedDataTensor({unit}, (t - 1) * unit);
}
- if (t < deriv_t.height() - 1) {
+ if (t < max_timestep - 1) {
dh.add_i(dh_nx);
}
Tensor rt = zrg_t.getSharedDataTensor({unit}, unit);
Tensor gt = zrg_t.getSharedDataTensor({unit}, unit * 2);
- zt.multiply(dh, dh_nx); // dh_nx = d1
-
- dh.multiply(hs_prev, dhz); // dhz = d2
+ zt.multiply(dh, dh_nx); // dh_nx = d1
+ dh.multiply(prev_hs, dhz); // dhz = d2
dhz.subtract_i(gt.multiply(dh)); // dhz = d5
zt.multiply(-1.0, dhg);
dhg.add_i(1.0);
wzr_hh.copy_with_stride(
weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false));
- Tensor temp = Tensor({hs_t.width()});
- temp.setZero();
- dhg.dot(wg_hh, temp, false, true); // temp = d10
- hs_prev.multiply(temp, dhr); // dhr = d15
- temp.multiply_i(rt); // temp=d14
- dh_nx.add_i(temp); // dh_nx = d1 + d14
- // reset temp : hs_prev * rt for djdw_g_h
- hs_prev.multiply(rt, temp);
- recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
+ Tensor temp = Tensor({unit});
+
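+ // Backward pass of the candidate-gate path; each branch mirrors the
+ // corresponding reset_after variant in forwarding().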
+ if (reset_after) {
+ prev_hs.dot(wg_hh, temp);
+ if (!disable_bias && !integrate_bias) {
+ const Tensor bias_hh_g =
+ bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ temp.add_i(bias_hh_g);
+ }
+ dhg.multiply(temp, dhr);
+
+ // reset temp: dhg * rt for djdbias_hh_g, dh_nx and djdweight_hh_g
+ dhg.multiply(rt, temp);
+ if (!disable_bias && !integrate_bias) {
+ Tensor djdbias_hh_g =
+ djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+ djdbias_hh_g.add_i(temp);
+ }
+ temp.dot(wg_hh, dh_nx, false, true, 1.0f); // dh_nx = d1 + d14
+ djdweight_hh_g.add_i(prev_hs.dot(temp, true, false));
+ } else {
+ if (!disable_bias && !integrate_bias) {
+ Tensor djdbias_hh_g =
+ djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+ djdbias_hh_g.add_i(dhg);
+ }
+
+ dhg.dot(wg_hh, temp, false, true); // temp = d10
+ temp.multiply(prev_hs, dhr); // dhr = d15
+ temp.multiply_i(rt); // temp=d14
+ dh_nx.add_i(temp); // dh_nx = d1 + d14
+
+ // reset temp : prev_hs * rt for djdweight_hh_g
+ rt.multiply(prev_hs, temp);
+ temp.dot(dhg, djdweight_hh_g, true, false, 1.0f);
+ }
- djdb_h.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+ recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
- djdw_x.add_i(xs.dot(dzrg_t, true, false));
+ if (!disable_bias) {
+ if (integrate_bias) {
+ djdbias_h.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+ } else {
+ djdbias_ih.add_i(dzrg_t); // dzrg_t = d7+d16+d8
+ Tensor djdbias_hh_zr = djdbias_hh.getSharedDataTensor({2 * unit}, 0);
+ djdbias_hh_zr.add_i(dzrg_t.getSharedDataTensor({2 * unit}, 0));
+ }
+ }
- djdw_zr_h.add_i(hs_prev.dot(dhzr, true, false));
- djdw_g_h.add_i(temp.dot(dhg, true, false));
+ djdweight_hh_zr.add_i(prev_hs.dot(dhzr, true, false));
+ xs.dot(dzrg_t, djdweight_ih, true, false, 1.0f);
dhzr.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17
}
}
for (unsigned int h = 0; h < unit; ++h) {
- float *data = djdw_zr_h.getAddress(h * unit * 2);
- float *rdata = djdw_h.getAddress(h * unit * NUM_GATE);
+ float *data = djdweight_hh_zr.getAddress(h * unit * 2);
+ float *rdata = djdweight_hh.getAddress(h * unit * NUM_GATE);
std::copy(data, data + unit * 2, rdata);
}
for (unsigned int h = 0; h < unit; ++h) {
- float *data = djdw_g_h.getAddress(h * unit);
- float *rdata = djdw_h.getAddress(h * unit * NUM_GATE + unit * 2);
+ float *data = djdweight_hh_g.getAddress(h * unit);
+ float *rdata = djdweight_hh.getAddress(h * unit * NUM_GATE + unit * 2);
std::copy(data, data + unit, rdata);
}
}
* RecurrentActivation: activation type for recurrent. default is sigmoid
* ReturnSequence: option for return sequence
* DropOutRate: dropout rate
+ * IntegrateBias: integrate bias_ih and bias_hh into bias_h
+ * ResetAfter: whether to apply the reset gate before or after the matrix
+ * multiplication; the reset gate is applied after the multiplication if true
*
* */
std::tuple<props::Unit, props::HiddenStateActivation,
props::RecurrentActivation, props::ReturnSequences,
- props::DropOutRate>
+ props::DropOutRate, props::IntegrateBias, props::ResetAfter>
gru_props;
- std::array<unsigned int, 7> wt_idx; /**< indices of the weights */
+ std::array<unsigned int, 9> wt_idx; /**< indices of the weights */
/**
* @brief activation function for h_t : default is sigmoid
props::HiddenStateActivation() = ActivationType::ACT_TANH,
props::RecurrentActivation() = ActivationType::ACT_SIGMOID,
props::DropOutRate(), props::IntegrateBias(),
- props::MaxTimestep(), props::Timestep()),
+ props::ResetAfter(), props::MaxTimestep(), props::Timestep()),
acti_func(ActivationType::ACT_NONE, true),
recurrent_acti_func(ActivationType::ACT_NONE, true),
epsilon(1e-3) {
const float dropout_rate = std::get<props::DropOutRate>(grucell_props).get();
const bool integrate_bias =
std::get<props::IntegrateBias>(grucell_props).get();
+ const bool reset_after = std::get<props::ResetAfter>(grucell_props).get();
const unsigned int max_timestep =
std::get<props::MaxTimestep>(grucell_props).get();
const unsigned int timestep = std::get<props::Timestep>(grucell_props).get();
Tensor r_gate = zr_gate.getSharedDataTensor({batch_size, unit}, unit, false);
Tensor temp;
- prev_hidden_state.dot(weight_hh_g, temp);
- if (!disable_bias && !integrate_bias) {
- Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
- temp.add_i(bias_hh_g);
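+ // Candidate gate: the same two reset_after variants as in
+ // GRULayer::forwarding.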
+ if (reset_after) {
+ prev_hidden_state.dot(weight_hh_g, temp);
+ if (!disable_bias && !integrate_bias) {
+ Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ temp.add_i(bias_hh_g);
+ }
+ temp.multiply_i_strided(r_gate);
+ g_gate.add_i_strided(temp);
+ } else {
+ r_gate.multiply_strided(prev_hidden_state, temp);
+ temp.dot(weight_hh_g, g_gate, false, false, 1.0f);
+ if (!disable_bias && !integrate_bias) {
+ Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ g_gate.add_i(bias_hh_g);
+ }
}
- temp.multiply_i_strided(r_gate);
- g_gate.add_i_strided(temp);
if (!disable_bias) {
if (integrate_bias) {
Tensor bias_h_g = bias_h.getSharedDataTensor({unit}, 2 * unit);
const float dropout_rate = std::get<props::DropOutRate>(grucell_props).get();
const bool integrate_bias =
std::get<props::IntegrateBias>(grucell_props).get();
+ const bool reset_after = std::get<props::ResetAfter>(grucell_props).get();
const unsigned int max_timestep =
std::get<props::MaxTimestep>(grucell_props).get();
const unsigned int timestep = std::get<props::Timestep>(grucell_props).get();
- Tensor &input = context.getInput(SINGLE_INOUT_IDX);
+ const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
const unsigned int batch_size = input.getDim().batch();
Tensor &djdweight_ih =
context.getWeightGrad(wt_idx[GRUCellParams::weight_ih]);
- Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]);
+ const Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]);
Tensor &djdweight_hh =
context.getWeightGrad(wt_idx[GRUCellParams::weight_hh]);
Tensor &djdbias_ih = !disable_bias && !integrate_bias
? context.getWeightGrad(wt_idx[GRUCellParams::bias_ih])
: empty;
- Tensor &bias_hh = !disable_bias && !integrate_bias
- ? context.getWeight(wt_idx[GRUCellParams::bias_hh])
- : empty;
+ const Tensor &bias_hh = !disable_bias && !integrate_bias
+ ? context.getWeight(wt_idx[GRUCellParams::bias_hh])
+ : empty;
Tensor &djdbias_hh = !disable_bias && !integrate_bias
? context.getWeightGrad(wt_idx[GRUCellParams::bias_hh])
: empty;
Tensor &hidden_states =
context.getTensor(wt_idx[GRUCellParams::hidden_state]);
hidden_states.reshape({max_timestep, 1, batch_size, unit});
- Tensor hidden_state = hidden_states.getBatchSlice(timestep, 1);
Tensor &hidden_states_derivatives =
context.getTensorGrad(wt_idx[GRUCellParams::hidden_state]);
Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
Tensor temp = Tensor(batch_size, unit);
Tensor dhg_;
dhg_.copy_with_stride(dhg);
- prev_hidden_state.dot(wg_hh, temp);
- if (!disable_bias && !integrate_bias) {
- Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
- temp.add_i(bias_hh_g);
- }
- dhg_.multiply_strided(temp, dhr); // dhr = d15
- // reset temp : prev_hidden_state * rt for djdbias_hh_g and dh_nx
- dhg_.multiply_strided(rt, temp);
- if (!disable_bias && !integrate_bias) {
- Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
- temp.sum(2, djdbias_hh_g, 1.0, 1.0);
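+ // Backward pass of the candidate gate for both reset_after variants,
+ // mirroring GRULayer::calcGradient.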
+ if (reset_after) {
+ prev_hidden_state.dot(wg_hh, temp);
+ if (!disable_bias && !integrate_bias) {
+ const Tensor bias_hh_g = bias_hh.getSharedDataTensor({unit}, 2 * unit);
+ temp.add_i(bias_hh_g);
+ }
+ dhg_.multiply_strided(temp, dhr); // dhr = d15
+
+ // reset temp: dhg_ * rt for djdbias_hh_g, dh_nx and djdweight_hh_g
+ dhg_.multiply_strided(rt, temp);
+ if (!disable_bias && !integrate_bias) {
+ Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+ temp.sum(2, djdbias_hh_g, 1.0, 1.0);
+ }
+ temp.dot(wg_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14
+ djdweight_hh_g.add_i_strided(prev_hidden_state.dot(temp, true, false));
+ } else {
+ if (!disable_bias && !integrate_bias) {
+ Tensor djdbias_hh_g = djdbias_hh.getSharedDataTensor({unit}, 2 * unit);
+ dhg.sum(2, djdbias_hh_g, 1.0, 1.0);
+ }
+
+ dhg_.dot(wg_hh, temp, false, true);
+ temp.multiply_strided(prev_hidden_state, dhr);
+ temp.multiply_strided(rt, dh_nx, 1.0f);
+
+ // reset temp: rt * prev_hidden_state for djdweight_hh_g
+ rt.multiply_strided(prev_hidden_state, temp);
+ temp.dot(dhg_, djdweight_hh_g, true, false, 1.0f);
}
- temp.dot(wg_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14
recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16
}
}
- djdweight_ih.add_i(input.dot(zrg_gate_derivative, true, false));
-
Tensor dhzr_;
dhzr_.copy_with_stride(dhzr);
djdweight_hh_zr.add_i_strided(prev_hidden_state.dot(dhzr_, true, false));
- djdweight_hh_g.add_i_strided(prev_hidden_state.dot(temp, true, false));
+ input.dot(zrg_gate_derivative, djdweight_ih, true, false, 1.0f);
dhzr_.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17
}
* RecurrentActivation: activation type for recurrent. default is sigmoid
* DropOutRate: dropout rate
* IntegrateBias: integrate bias_ih and bias_hh into bias_h
+ * ResetAfter: whether to apply the reset gate before or after the matrix
+ * multiplication; the reset gate is applied after the multiplication if true
* MaxTimeStep: Maximum timestep of gru
* TimeStep: timestep for which gru should operate
*
* */
std::tuple<props::Unit, props::HiddenStateActivation,
props::RecurrentActivation, props::DropOutRate,
- props::IntegrateBias, props::MaxTimestep, props::Timestep>
+ props::IntegrateBias, props::ResetAfter, props::MaxTimestep,
+ props::Timestep>
grucell_props;
std::array<unsigned int, 9> wt_idx; /**< indices of the weights */
return_state=False)
record_single(lstm, (3, 4, 7), "lstm_multi_step_seq_act")
- gru = K.layers.GRU(units=5, reset_after=False,
+ gru = K.layers.GRU(units=5, activation="tanh",
recurrent_activation="sigmoid",
- activation="tanh",
+ bias_initializer='GlorotUniform',
return_sequences=False,
- return_state=False)
+ return_state=False,
+ reset_after=False)
record_single(gru, (3, 1, 7), "gru_single_step")
record_single(gru, (3, 4, 7), "gru_multi_step")
- gru = K.layers.GRU(units=5, reset_after=False,
+ gru = K.layers.GRU(units=5, activation="tanh",
recurrent_activation="sigmoid",
- activation="tanh",
+ bias_initializer='GlorotUniform',
return_sequences=True,
- return_state=False)
+ return_state=False,
+ reset_after=False)
record_single(gru, (3, 1, 7), "gru_single_step_seq")
record_single(gru, (3, 4, 7), "gru_multi_step_seq", input_type='float')
- gru = K.layers.GRU(units=5, reset_after=False,
+ gru = K.layers.GRU(units=5, activation="sigmoid",
recurrent_activation="tanh",
- activation="sigmoid",
+ bias_initializer='GlorotUniform',
return_sequences=True,
- return_state=False)
+ return_state=False,
+ reset_after=False)
record_single(gru, (3, 4, 7), "gru_multi_step_seq_act", input_type='float')
+ # check reset_after
+ gru = K.layers.GRU(units=5, activation="tanh",
+ recurrent_activation="sigmoid",
+ bias_initializer='GlorotUniform',
+ return_sequences=False,
+ return_state=False,
+ reset_after=True)
+ record_single(gru, (3, 1, 7), "gru_reset_after_single_step")
+ record_single(gru, (3, 4, 7), "gru_reset_after_multi_step")
+
+ gru = K.layers.GRU(units=5, activation="tanh",
+ recurrent_activation="sigmoid",
+ bias_initializer='GlorotUniform',
+ return_sequences=True,
+ return_state=False,
+ reset_after=True)
+ record_single(gru, (3, 1, 7), "gru_reset_after_single_step_seq")
+ record_single(gru, (3, 4, 7), "gru_reset_after_multi_step_seq", input_type='float')
+
+ gru = K.layers.GRU(units=5, activation="sigmoid",
+ recurrent_activation="tanh",
+ bias_initializer='GlorotUniform',
+ return_sequences=True,
+ return_state=False,
+ reset_after=True)
+ record_single(gru, (3, 4, 7), "gru_reset_after_multi_step_seq_act", input_type='float')
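+ # Note: Keras defaults to reset_after=True (the cuDNN-compatible variant);
+ # the cases above record goldens for both variants.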
+
dropout = K.layers.Dropout(rate=0.2)
record_single(dropout, (2, 3, 2, 3), "dropout_20_training", {"training": True})
record_single(dropout, (2, 3, 2, 3), "dropout_20_inference", {"training": False})
if __name__ == "__main__":
- def multiout_test():
- # x -> [a, b] -> c
- x = K.Input(shape=(2, 3, 5), name="x")
- # because the sort order is x -> [b, a] -> c, b0 must out first.
- b0, a0 = MultiOutLayer(num_output=2)(x)
- a1 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=3, strides=2, padding="same", name="multiout_a1"
- )
- )(a0)
- a2 = K.layers.Activation("relu", name="multiout_a2")(a1)
- a3 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=3, padding="same", name="multiout_a3"
- )
- )(a2)
- a4 = K.layers.Flatten(name="multiout_a4")(a3)
- a5 = K.layers.Dense(10, name="multiout_a5")(a4)
- a6 = K.layers.Activation("softmax", name="multiout_a6")(a5)
- b1 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=1, strides=2, padding="same", name="multiout_b1"
- )
- )(b0)
- b2 = K.layers.Flatten(name="multiout_b2")(b1)
- b3 = K.layers.Dense(10, name="multiout_b3")(b2)
- b4 = K.layers.Activation("softmax", name="multiout_b4")(b3)
-
- return x, [x, b0, b1, b2, b3, b4, a0, a1, a2, a3, a4, a5, a6]
-
- x, y = multiout_test()
- record(
- loss_fn_str="mse",
- file_name="multiple_output_model.info",
- input_shape=(3, 2, 3, 5),
- label_shape=(3, 10),
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- inputs=x,
- outputs=y,
- multi_out=[5, 12],
- # debug=["name", "summary", "output", "initial_weights"],
- )
-
- ## please generate all test cases since golden data format can change anytime
- fc_sigmoid = [
- K.Input(shape=(3, 3)),
- K.layers.Dense(5),
- K.layers.Activation("sigmoid"),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ]
-
- fc_sigmoid_tc = partial(
- record,
- model=fc_sigmoid,
- input_shape=(3, 3),
- label_shape=(3, 10),
- iteration=10,
- optimizer=opt.SGD(learning_rate=1.0),
- )
-
- fc_sigmoid_tc(file_name="fc_sigmoid_mse.info", loss_fn_str="mse")
-
- fc_sigmoid_tc(
- file_name="fc_sigmoid_cross.info", loss_fn_str="cross_softmax",
- )
-
- fc_relu = [
- K.Input(shape=(3)),
- K.layers.Dense(10),
- K.layers.Activation("relu"),
- K.layers.Dense(2),
- K.layers.Activation("sigmoid"),
- ]
-
- fc_relu_tc = partial(
- record, model=fc_relu, input_shape=(3, 3), label_shape=(3, 2), iteration=10
- )
-
- fc_relu_tc(
- file_name="fc_relu_mse.info",
- loss_fn_str="mse",
- optimizer=opt.SGD(learning_rate=0.1),
- )
-
- fc_bn_sigmoid = [
- K.Input(shape=(3)),
- K.layers.Dense(10),
- K.layers.BatchNormalization(),
- K.layers.Activation("sigmoid"),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ]
-
- fc_bn_sigmoid_tc = partial(
- record,
- model=fc_bn_sigmoid,
- input_shape=(3, 3),
- label_shape=(3, 10),
- optimizer=opt.SGD(learning_rate=1),
- iteration=10,
- )
-
- fc_bn_sigmoid_tc(
- file_name="fc_bn_sigmoid_cross.info",
- loss_fn_str="cross_softmax",
- # debug=["summary", "iteration", "weights"],
- )
-
- fc_bn_sigmoid_tc(
- file_name="fc_bn_sigmoid_mse.info", loss_fn_str="mse",
- )
-
- _mnist_block = lambda filter_size: [
- K.layers.Conv2D(filters=filter_size, kernel_size=(3, 4)),
- K.layers.Activation("sigmoid"),
- K.layers.AveragePooling2D(pool_size=(2, 2)),
- ]
-
- mnist_conv = [
- K.Input(shape=(2, 4, 5)),
- *_mnist_block(2),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ]
-
- mnist_conv_tc = partial(
- record, model=mnist_conv, optimizer=opt.SGD(learning_rate=0.1), iteration=10,
- )
-
- mnist_conv_tc(
- input_shape=(3, 2, 4, 5),
- label_shape=(3, 10),
- file_name="mnist_conv_cross.info",
- loss_fn_str="cross_softmax",
- # debug=["summary", "loss", "layer_name", "initial_weights"],
- )
-
- mnist_conv_tc(
- input_shape=(1, 2, 4, 5),
- label_shape=(1, 10),
- file_name="mnist_conv_cross_one_input.info",
- loss_fn_str="cross_softmax",
- # debug=["summary", "loss", "layer_name", "initial_weights"],
- )
-
- conv_nxn_model = lambda kernel_size: [
- K.Input(shape=(2, 4, 5)),
- K.layers.Conv2D(filters=4, kernel_size=kernel_size),
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ]
-
- conv_nxn_tc = partial(
- record,
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 4, 5),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- )
-
- # 1x1 kernel size
- conv_nxn_tc(
- model=conv_nxn_model((1, 1)), file_name="conv_1x1.info",
- )
-
- # height width is same as input size
- conv_nxn_tc(
- model=conv_nxn_model((4, 5)), file_name="conv_input_matches_kernel.info"
- )
-
- conv_layer_tc = lambda **conv_args: partial(
- record,
- model=[
- K.Input(shape=(2, 5, 3)),
- K.layers.Conv2D(filters=4, kernel_size=(3, 3), **conv_args),
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 5, 3),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- )
-
- conv_layer_tc()(file_name="conv_basic.info")
- conv_layer_tc(padding="same")(file_name="conv_same_padding.info") # padding: 1, 1
- conv_layer_tc(strides=(2, 2))(file_name="conv_multi_stride.info")
- conv_layer_tc(padding="same", strides=(2, 2))( # padding: 1, 1
- file_name="conv_same_padding_multi_stride.info"
- )
-
- conv_layer_tc(strides=(3, 3))(file_name="conv_uneven_strides.info")
-
- record(
- file_name="conv_uneven_strides2.info",
- model=[
- K.Input(shape=(2, 4, 4)),
- K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(1, 2)),
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 4, 4),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- # debug="summary"
- )
-
- record(
- file_name="conv_uneven_strides3.info",
- model=[
- K.Input(shape=(2, 4, 4)),
- K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(2, 1)),
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 4, 4),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- )
-
- record(
- file_name="conv_bn.info",
- model=[
- K.Input(shape=(2, 3, 5)),
- K.layers.Conv2D(filters=2, kernel_size=(2, 2)),
- K.layers.BatchNormalization(),
- K.layers.Activation("relu"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 3, 5),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- # debug=["summary", "initial_weights"]
- )
-
- pool_layer_tc = lambda pool_layer: partial(
- record,
- model=[
- K.Input(shape=(2, 5, 3)),
- pool_layer,
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 5, 3),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- )
-
- pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="same"))(
- file_name="pooling_max_same_padding.info", # debug="output"
- ) # padding: 1, 1
-
- pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="valid"))(
- file_name="pooling_max_valid_padding.info", # debug="output"
- ) # padding: 1, 1
-
- pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="same"))(
- file_name="pooling_avg_same_padding.info", # debug="dx"
- ) # padding: 1, 1
-
- pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="valid"))(
- file_name="pooling_avg_valid_padding.info", # debug="dx"
- )
-
- pool_layer_tc(K.layers.GlobalAvgPool2D(data_format="channels_first"))(
- file_name="pooling_global_avg.info", # debug="summary"
- )
-
- pool_layer_tc(K.layers.GlobalMaxPool2D(data_format="channels_first"))(
- file_name="pooling_global_max.info", # debug="dx"
- )
-
- pool_layer_tc2 = lambda pool_layer: partial(
- record,
- model=[
- K.Input(shape=(2, 3, 5)),
- pool_layer,
- K.layers.Activation("sigmoid"),
- K.layers.Flatten(),
- K.layers.Dense(10),
- K.layers.Activation("softmax"),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(3, 2, 3, 5),
- label_shape=(3, 10),
- loss_fn_str="cross_softmax",
- )
-
- pool_layer_tc2(K.layers.MaxPooling2D(pool_size=3, strides=2, padding="same"))(
- file_name="pooling_max_same_padding_multi_stride.info", # debug="dx"
- )
-
- pool_layer_tc2(K.layers.AveragePooling2D(pool_size=3, strides=2, padding="same"))(
- file_name="pooling_avg_same_padding_multi_stride.info", # debug="output"
- )
-
- def addition_test():
- # x -> [a, b] -> c
- x = K.Input(shape=(2, 3, 5), name="x")
- # because the sort order is x -> [b, a] -> c, b0 must out first.
- b0, a0 = MultiOutLayer(num_output=2)(x)
- a1 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=3, strides=2, padding="same", name="addition_a1"
- )
- )(a0)
- a2 = K.layers.Activation("relu", name="addition_a2")(a1)
- a3 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=3, padding="same", name="addition_a3"
- )
- )(a2)
- b1 = TL(
- K.layers.Conv2D(
- filters=4, kernel_size=1, strides=2, padding="same", name="addition_b1"
- )
- )(b0)
- c1 = K.layers.Add(name="addition_c1")([a3, b1])
- c2 = K.layers.Flatten(name="addition_c2")(c1)
- c3 = K.layers.Dense(10, name="addition_c3")(c2)
- c4 = K.layers.Activation("softmax", name="addition_c4")(c3)
-
- return x, [x, b0, b1, a0, a1, a2, a3, c1, c2, c3, c4]
-
- x, y = addition_test()
- record(
- loss_fn_str="mse",
- file_name="addition_resnet_like.info",
- input_shape=(3, 2, 3, 5),
- label_shape=(3, 10),
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- inputs=x,
- outputs=y,
- # debug=["name", "summary", "output", "initial_weights"],
- )
-
-
- def resnet18(num_class, input_shape):
- def block(x, filters, kernel_size, downsample = False):
- # because the sort order is x -> [b, a] -> c, b0 must out first.
- b0, a0 = MultiOutLayer(num_output=2)(x)
- a1 = TL(K.layers.Conv2D(kernel_size=kernel_size,
- strides= (1 if not downsample else 2),
- filters=filters,
- padding="same"))(a0)
- a2 = TL(K.layers.BatchNormalization())(a1)
- a3 = TL(K.layers.ReLU())(a2)
- a4 = TL(K.layers.Conv2D(kernel_size=kernel_size,
- strides=1,
- filters=filters,
- padding="same"))(a3)
-
- if downsample:
- b1 = TL(K.layers.Conv2D(kernel_size=1,
- strides=2,
- filters=filters,
- padding="same"))(b0)
- else:
- b1 = b0
- o1 = K.layers.Add()([a4, b1])
- o2 = TL(K.layers.BatchNormalization())(o1)
- o3 = K.layers.Activation("relu")(o2)
-
- if (downsample):
- ret_array = [a0, a1, a2, a3, a4, b0, b1, o1, o2, o3]
- else:
- ret_array = [a0, a1, a2, a3, a4, b0, o1, o2, o3]
- return ret_array
-
-
- # x -> [a, b] -> c
- x = K.Input(shape=input_shape, name="x")
- out_nodes = [x]
- # initial section of resnet
- conv0 = TL(K.layers.Conv2D(
- filters=64, kernel_size=3, strides=1, padding="same"))
- bn0 = TL(K.layers.BatchNormalization())
- act0 = K.layers.Activation("relu")
-
- out_nodes.append(conv0(out_nodes[-1]))
- out_nodes.append(bn0(out_nodes[-1]))
- out_nodes.append(act0(out_nodes[-1]))
-
- # Add all the resnet blocks
- out_nodes.extend(block(out_nodes[-1], 64, 3, False))
- out_nodes.extend(block(out_nodes[-1], 64, 3, False))
- out_nodes.extend(block(out_nodes[-1], 128, 3, True))
- out_nodes.extend(block(out_nodes[-1], 128, 3, False))
- out_nodes.extend(block(out_nodes[-1], 256, 3, True))
- out_nodes.extend(block(out_nodes[-1], 256, 3, False))
- out_nodes.extend(block(out_nodes[-1], 512, 3, True))
- out_nodes.extend(block(out_nodes[-1], 512, 3, False))
-
- # add the suffix part
- pool0 = TL(K.layers.AveragePooling2D(pool_size=4))
- flat0 = K.layers.Flatten()
- dense0 = K.layers.Dense(num_class)
- sm0 = K.layers.Activation("softmax")
-
- out_nodes.append(pool0(out_nodes[-1]))
- out_nodes.append(flat0(out_nodes[-1]))
- out_nodes.append(dense0(out_nodes[-1]))
- out_nodes.append(sm0(out_nodes[-1]))
-
- return x, out_nodes
-
- x, y = resnet18(100, (3,32,32))
- record(
- loss_fn_str="cross_softmax",
- file_name="ResNet18.info",
- input_shape=(2, 3, 32, 32),
- label_shape=(2, 100),
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=2,
- inputs=x,
- outputs=y,
- record_only_outputs=True
- # debug=["file_shape_generation", "name"],
- )
-
- lstm_layer_tc = lambda batch, time, return_sequences: partial(
- record,
- model=[
- K.Input(shape=(time, 1)),
- K.layers.LSTM(
- time,
- recurrent_activation="sigmoid",
- activation="tanh",
- return_sequences=return_sequences,
- ),
- K.layers.Dense(1),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(batch, time, 1),
- label_shape=(batch, time, 1),
- is_onehot=False,
- loss_fn_str="mse",
- )
-
- lstm_layer_tc(1, 1, False)(file_name="lstm_basic.info")
- lstm_layer_tc(1, 2, True)(file_name="lstm_return_sequence.info")
- lstm_layer_tc(2, 2, True)(file_name="lstm_return_sequence_with_batch.info")
-
- multi_lstm_layer_tc = lambda batch, time: partial(
- record,
- model=[
- K.Input(batch_shape=(batch, time, 1)),
- K.layers.LSTM(
- time,
- recurrent_activation="sigmoid",
- activation="tanh",
- return_sequences=True,
- ),
- K.layers.LSTM(time, recurrent_activation="sigmoid", activation="tanh"),
- K.layers.Dense(1),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(batch, time, 1),
- label_shape=(batch, 1),
- is_onehot=False,
- loss_fn_str="mse",
- )
- multi_lstm_layer_tc(1,2)(file_name="multi_lstm_return_sequence.info")
- multi_lstm_layer_tc(2,2)(file_name="multi_lstm_return_sequence_with_batch.info")
-
- rnn_layer_tc = lambda batch, time, return_sequences: partial(
- record,
- model=[
- K.Input(shape=(time, 1)),
- K.layers.SimpleRNN(2, return_sequences=return_sequences),
- K.layers.Dense(1),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(batch, time, 1),
- label_shape=(batch, time, 1),
- is_onehot=False,
- loss_fn_str="mse",
- )
- rnn_layer_tc(1, 1, False)(file_name="rnn_basic.info")
- rnn_layer_tc(1, 2, True)(file_name="rnn_return_sequences.info")
- rnn_layer_tc(2, 2, True)(file_name="rnn_return_sequence_with_batch.info")
-
- multi_rnn_layer_tc = lambda batch, time: partial(
+ # def multiout_test():
+ # # x -> [a, b] -> c
+ # x = K.Input(shape=(2, 3, 5), name="x")
+ # # because the sort order is x -> [b, a] -> c, b0 must out first.
+ # b0, a0 = MultiOutLayer(num_output=2)(x)
+ # a1 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=3, strides=2, padding="same", name="multiout_a1"
+ # )
+ # )(a0)
+ # a2 = K.layers.Activation("relu", name="multiout_a2")(a1)
+ # a3 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=3, padding="same", name="multiout_a3"
+ # )
+ # )(a2)
+ # a4 = K.layers.Flatten(name="multiout_a4")(a3)
+ # a5 = K.layers.Dense(10, name="multiout_a5")(a4)
+ # a6 = K.layers.Activation("softmax", name="multiout_a6")(a5)
+ # b1 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=1, strides=2, padding="same", name="multiout_b1"
+ # )
+ # )(b0)
+ # b2 = K.layers.Flatten(name="multiout_b2")(b1)
+ # b3 = K.layers.Dense(10, name="multiout_b3")(b2)
+ # b4 = K.layers.Activation("softmax", name="multiout_b4")(b3)
+
+ # return x, [x, b0, b1, b2, b3, b4, a0, a1, a2, a3, a4, a5, a6]
+
+ # x, y = multiout_test()
+ # record(
+ # loss_fn_str="mse",
+ # file_name="multiple_output_model.info",
+ # input_shape=(3, 2, 3, 5),
+ # label_shape=(3, 10),
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # inputs=x,
+ # outputs=y,
+ # multi_out=[5, 12],
+ # # debug=["name", "summary", "output", "initial_weights"],
+ # )
+
+ # ## please generate all test cases since golden data format can change anytime
+ # fc_sigmoid = [
+ # K.Input(shape=(3, 3)),
+ # K.layers.Dense(5),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ]
+
+ # fc_sigmoid_tc = partial(
+ # record,
+ # model=fc_sigmoid,
+ # input_shape=(3, 3),
+ # label_shape=(3, 10),
+ # iteration=10,
+ # optimizer=opt.SGD(learning_rate=1.0),
+ # )
+
+ # fc_sigmoid_tc(file_name="fc_sigmoid_mse.info", loss_fn_str="mse")
+
+ # fc_sigmoid_tc(
+ # file_name="fc_sigmoid_cross.info", loss_fn_str="cross_softmax",
+ # )
+
+ # fc_relu = [
+ # K.Input(shape=(3)),
+ # K.layers.Dense(10),
+ # K.layers.Activation("relu"),
+ # K.layers.Dense(2),
+ # K.layers.Activation("sigmoid"),
+ # ]
+
+ # fc_relu_tc = partial(
+ # record, model=fc_relu, input_shape=(3, 3), label_shape=(3, 2), iteration=10
+ # )
+
+ # fc_relu_tc(
+ # file_name="fc_relu_mse.info",
+ # loss_fn_str="mse",
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # )
+
+ # fc_bn_sigmoid = [
+ # K.Input(shape=(3)),
+ # K.layers.Dense(10),
+ # K.layers.BatchNormalization(),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ]
+
+ # fc_bn_sigmoid_tc = partial(
+ # record,
+ # model=fc_bn_sigmoid,
+ # input_shape=(3, 3),
+ # label_shape=(3, 10),
+ # optimizer=opt.SGD(learning_rate=1),
+ # iteration=10,
+ # )
+
+ # fc_bn_sigmoid_tc(
+ # file_name="fc_bn_sigmoid_cross.info",
+ # loss_fn_str="cross_softmax",
+ # # debug=["summary", "iteration", "weights"],
+ # )
+
+ # fc_bn_sigmoid_tc(
+ # file_name="fc_bn_sigmoid_mse.info", loss_fn_str="mse",
+ # )
+
+ # _mnist_block = lambda filter_size: [
+ # K.layers.Conv2D(filters=filter_size, kernel_size=(3, 4)),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.AveragePooling2D(pool_size=(2, 2)),
+ # ]
+
+ # mnist_conv = [
+ # K.Input(shape=(2, 4, 5)),
+ # *_mnist_block(2),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ]
+
+ # mnist_conv_tc = partial(
+ # record, model=mnist_conv, optimizer=opt.SGD(learning_rate=0.1), iteration=10,
+ # )
+
+ # mnist_conv_tc(
+ # input_shape=(3, 2, 4, 5),
+ # label_shape=(3, 10),
+ # file_name="mnist_conv_cross.info",
+ # loss_fn_str="cross_softmax",
+ # # debug=["summary", "loss", "layer_name", "initial_weights"],
+ # )
+
+ # mnist_conv_tc(
+ # input_shape=(1, 2, 4, 5),
+ # label_shape=(1, 10),
+ # file_name="mnist_conv_cross_one_input.info",
+ # loss_fn_str="cross_softmax",
+ # # debug=["summary", "loss", "layer_name", "initial_weights"],
+ # )
+
+ # conv_nxn_model = lambda kernel_size: [
+ # K.Input(shape=(2, 4, 5)),
+ # K.layers.Conv2D(filters=4, kernel_size=kernel_size),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ]
+
+ # conv_nxn_tc = partial(
+ # record,
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 4, 5),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # )
+
+ # # 1x1 kernel size
+ # conv_nxn_tc(
+ # model=conv_nxn_model((1, 1)), file_name="conv_1x1.info",
+ # )
+
+ # # height width is same as input size
+ # conv_nxn_tc(
+ # model=conv_nxn_model((4, 5)), file_name="conv_input_matches_kernel.info"
+ # )
+
+ # conv_layer_tc = lambda **conv_args: partial(
+ # record,
+ # model=[
+ # K.Input(shape=(2, 5, 3)),
+ # K.layers.Conv2D(filters=4, kernel_size=(3, 3), **conv_args),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 5, 3),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # )
+
+ # conv_layer_tc()(file_name="conv_basic.info")
+ # conv_layer_tc(padding="same")(file_name="conv_same_padding.info") # padding: 1, 1
+ # conv_layer_tc(strides=(2, 2))(file_name="conv_multi_stride.info")
+ # conv_layer_tc(padding="same", strides=(2, 2))( # padding: 1, 1
+ # file_name="conv_same_padding_multi_stride.info"
+ # )
+
+ # conv_layer_tc(strides=(3, 3))(file_name="conv_uneven_strides.info")
+
+ # record(
+ # file_name="conv_uneven_strides2.info",
+ # model=[
+ # K.Input(shape=(2, 4, 4)),
+ # K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(1, 2)),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 4, 4),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # # debug="summary"
+ # )
+
+ # record(
+ # file_name="conv_uneven_strides3.info",
+ # model=[
+ # K.Input(shape=(2, 4, 4)),
+ # K.layers.Conv2D(filters=2, kernel_size=(2, 2), strides=(2, 1)),
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 4, 4),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # )
+
+ # record(
+ # file_name="conv_bn.info",
+ # model=[
+ # K.Input(shape=(2, 3, 5)),
+ # K.layers.Conv2D(filters=2, kernel_size=(2, 2)),
+ # K.layers.BatchNormalization(),
+ # K.layers.Activation("relu"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 3, 5),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # # debug=["summary", "initial_weights"]
+ # )
+
+ # pool_layer_tc = lambda pool_layer: partial(
+ # record,
+ # model=[
+ # K.Input(shape=(2, 5, 3)),
+ # pool_layer,
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 5, 3),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # )
+
+ # pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="same"))(
+ # file_name="pooling_max_same_padding.info", # debug="output"
+ # ) # padding: 1, 1
+
+ # pool_layer_tc(K.layers.MaxPooling2D(pool_size=3, strides=1, padding="valid"))(
+ # file_name="pooling_max_valid_padding.info", # debug="output"
+ # )
+
+ # pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="same"))(
+ # file_name="pooling_avg_same_padding.info", # debug="dx"
+ # ) # padding: 1, 1
+
+ # pool_layer_tc(K.layers.AveragePooling2D(pool_size=3, strides=1, padding="valid"))(
+ # file_name="pooling_avg_valid_padding.info", # debug="dx"
+ # )
+
+ # pool_layer_tc(K.layers.GlobalAvgPool2D(data_format="channels_first"))(
+ # file_name="pooling_global_avg.info", # debug="summary"
+ # )
+
+ # pool_layer_tc(K.layers.GlobalMaxPool2D(data_format="channels_first"))(
+ # file_name="pooling_global_max.info", # debug="dx"
+ # )
+
+ # pool_layer_tc2 = lambda pool_layer: partial(
+ # record,
+ # model=[
+ # K.Input(shape=(2, 3, 5)),
+ # pool_layer,
+ # K.layers.Activation("sigmoid"),
+ # K.layers.Flatten(),
+ # K.layers.Dense(10),
+ # K.layers.Activation("softmax"),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(3, 2, 3, 5),
+ # label_shape=(3, 10),
+ # loss_fn_str="cross_softmax",
+ # )
+
+ # pool_layer_tc2(K.layers.MaxPooling2D(pool_size=3, strides=2, padding="same"))(
+ # file_name="pooling_max_same_padding_multi_stride.info", # debug="dx"
+ # )
+
+ # pool_layer_tc2(K.layers.AveragePooling2D(pool_size=3, strides=2, padding="same"))(
+ # file_name="pooling_avg_same_padding_multi_stride.info", # debug="output"
+ # )
+
+ # def addition_test():
+ # # x -> [a, b] -> c
+ # x = K.Input(shape=(2, 3, 5), name="x")
+ # # because the sort order is x -> [b, a] -> c, b0 must come out first.
+ # b0, a0 = MultiOutLayer(num_output=2)(x)
+ # a1 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=3, strides=2, padding="same", name="addition_a1"
+ # )
+ # )(a0)
+ # a2 = K.layers.Activation("relu", name="addition_a2")(a1)
+ # a3 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=3, padding="same", name="addition_a3"
+ # )
+ # )(a2)
+ # b1 = TL(
+ # K.layers.Conv2D(
+ # filters=4, kernel_size=1, strides=2, padding="same", name="addition_b1"
+ # )
+ # )(b0)
+ # c1 = K.layers.Add(name="addition_c1")([a3, b1])
+ # c2 = K.layers.Flatten(name="addition_c2")(c1)
+ # c3 = K.layers.Dense(10, name="addition_c3")(c2)
+ # c4 = K.layers.Activation("softmax", name="addition_c4")(c3)
+
+ # return x, [x, b0, b1, a0, a1, a2, a3, c1, c2, c3, c4]
+
+ # x, y = addition_test()
+ # record(
+ # loss_fn_str="mse",
+ # file_name="addition_resnet_like.info",
+ # input_shape=(3, 2, 3, 5),
+ # label_shape=(3, 10),
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # inputs=x,
+ # outputs=y,
+ # # debug=["name", "summary", "output", "initial_weights"],
+ # )
+
+
+ # def resnet18(num_class, input_shape):
+ # def block(x, filters, kernel_size, downsample = False):
+ # # because the sort order is x -> [b, a] -> c, b0 must come out first.
+ # b0, a0 = MultiOutLayer(num_output=2)(x)
+ # a1 = TL(K.layers.Conv2D(kernel_size=kernel_size,
+ # strides= (1 if not downsample else 2),
+ # filters=filters,
+ # padding="same"))(a0)
+ # a2 = TL(K.layers.BatchNormalization())(a1)
+ # a3 = TL(K.layers.ReLU())(a2)
+ # a4 = TL(K.layers.Conv2D(kernel_size=kernel_size,
+ # strides=1,
+ # filters=filters,
+ # padding="same"))(a3)
+
+ # if downsample:
+ # b1 = TL(K.layers.Conv2D(kernel_size=1,
+ # strides=2,
+ # filters=filters,
+ # padding="same"))(b0)
+ # else:
+ # b1 = b0
+ # o1 = K.layers.Add()([a4, b1])
+ # o2 = TL(K.layers.BatchNormalization())(o1)
+ # o3 = K.layers.Activation("relu")(o2)
+
+ # if downsample:
+ # ret_array = [a0, a1, a2, a3, a4, b0, b1, o1, o2, o3]
+ # else:
+ # ret_array = [a0, a1, a2, a3, a4, b0, o1, o2, o3]
+ # return ret_array
+
+
+ # # x -> [a, b] -> c
+ # x = K.Input(shape=input_shape, name="x")
+ # out_nodes = [x]
+ # # initial section of resnet
+ # conv0 = TL(K.layers.Conv2D(
+ # filters=64, kernel_size=3, strides=1, padding="same"))
+ # bn0 = TL(K.layers.BatchNormalization())
+ # act0 = K.layers.Activation("relu")
+
+ # out_nodes.append(conv0(out_nodes[-1]))
+ # out_nodes.append(bn0(out_nodes[-1]))
+ # out_nodes.append(act0(out_nodes[-1]))
+
+ # # Add all the resnet blocks
+ # out_nodes.extend(block(out_nodes[-1], 64, 3, False))
+ # out_nodes.extend(block(out_nodes[-1], 64, 3, False))
+ # out_nodes.extend(block(out_nodes[-1], 128, 3, True))
+ # out_nodes.extend(block(out_nodes[-1], 128, 3, False))
+ # out_nodes.extend(block(out_nodes[-1], 256, 3, True))
+ # out_nodes.extend(block(out_nodes[-1], 256, 3, False))
+ # out_nodes.extend(block(out_nodes[-1], 512, 3, True))
+ # out_nodes.extend(block(out_nodes[-1], 512, 3, False))
+
+ # # add the suffix part
+ # pool0 = TL(K.layers.AveragePooling2D(pool_size=4))
+ # flat0 = K.layers.Flatten()
+ # dense0 = K.layers.Dense(num_class)
+ # sm0 = K.layers.Activation("softmax")
+
+ # out_nodes.append(pool0(out_nodes[-1]))
+ # out_nodes.append(flat0(out_nodes[-1]))
+ # out_nodes.append(dense0(out_nodes[-1]))
+ # out_nodes.append(sm0(out_nodes[-1]))
+
+ # return x, out_nodes
+
+ # x, y = resnet18(100, (3,32,32))
+ # record(
+ # loss_fn_str="cross_softmax",
+ # file_name="ResNet18.info",
+ # input_shape=(2, 3, 32, 32),
+ # label_shape=(2, 100),
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=2,
+ # inputs=x,
+ # outputs=y,
+ # record_only_outputs=True
+ # # debug=["file_shape_generation", "name"],
+ # )
+
+ # lstm_layer_tc = lambda batch, time, return_sequences: partial(
+ # record,
+ # model=[
+ # K.Input(shape=(time, 1)),
+ # K.layers.LSTM(
+ # time,
+ # recurrent_activation="sigmoid",
+ # activation="tanh",
+ # return_sequences=return_sequences,
+ # ),
+ # K.layers.Dense(1),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(batch, time, 1),
+ # label_shape=(batch, time, 1),
+ # is_onehot=False,
+ # loss_fn_str="mse",
+ # )
+
+ # lstm_layer_tc(1, 1, False)(file_name="lstm_basic.info")
+ # lstm_layer_tc(1, 2, True)(file_name="lstm_return_sequence.info")
+ # lstm_layer_tc(2, 2, True)(file_name="lstm_return_sequence_with_batch.info")
+
+ # multi_lstm_layer_tc = lambda batch, time: partial(
+ # record,
+ # model=[
+ # K.Input(batch_shape=(batch, time, 1)),
+ # K.layers.LSTM(
+ # time,
+ # recurrent_activation="sigmoid",
+ # activation="tanh",
+ # return_sequences=True,
+ # ),
+ # K.layers.LSTM(time, recurrent_activation="sigmoid", activation="tanh"),
+ # K.layers.Dense(1),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(batch, time, 1),
+ # label_shape=(batch, 1),
+ # is_onehot=False,
+ # loss_fn_str="mse",
+ # )
+ # multi_lstm_layer_tc(1,2)(file_name="multi_lstm_return_sequence.info")
+ # multi_lstm_layer_tc(2,2)(file_name="multi_lstm_return_sequence_with_batch.info")
+
+ # rnn_layer_tc = lambda batch, time, return_sequences: partial(
+ # record,
+ # model=[
+ # K.Input(shape=(time, 1)),
+ # K.layers.SimpleRNN(2, return_sequences=return_sequences),
+ # K.layers.Dense(1),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(batch, time, 1),
+ # label_shape=(batch, time, 1),
+ # is_onehot=False,
+ # loss_fn_str="mse",
+ # )
+ # rnn_layer_tc(1, 1, False)(file_name="rnn_basic.info")
+ # rnn_layer_tc(1, 2, True)(file_name="rnn_return_sequences.info")
+ # rnn_layer_tc(2, 2, True)(file_name="rnn_return_sequence_with_batch.info")
+
+ # multi_rnn_layer_tc = lambda batch, time: partial(
+ # record,
+ # model=[
+ # K.Input(batch_shape=(batch, time, 1)),
+ # K.layers.SimpleRNN(
+ # time,
+ # return_sequences=True,
+ # ),
+ # K.layers.SimpleRNN(time),
+ # K.layers.Dense(1),
+ # ],
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # input_shape=(batch, time, 1),
+ # label_shape=(batch, 1),
+ # is_onehot=False,
+ # loss_fn_str="mse",
+ # )
+ # multi_rnn_layer_tc(1,2)(file_name="multi_rnn_return_sequence.info")
+ # multi_rnn_layer_tc(2,2)(file_name="multi_rnn_return_sequence_with_batch.info")
+
+ gru_layer_tc = lambda batch, time, unit, feature_size, return_sequences, reset_after: partial(
record,
model=[
- K.Input(batch_shape=(batch, time, 1)),
- K.layers.SimpleRNN(
- time,
- return_sequences=True,
- ),
- K.layers.SimpleRNN(time),
- K.layers.Dense(1),
- ],
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- input_shape=(batch, time, 1),
- label_shape=(batch, 1),
- is_onehot=False,
- loss_fn_str="mse",
- )
- multi_rnn_layer_tc(1,2)(file_name="multi_rnn_return_sequence.info")
- multi_rnn_layer_tc(2,2)(file_name="multi_rnn_return_sequence_with_batch.info")
-
- gru_layer_tc = lambda batch, time, return_sequences: partial(
- record,
- model=[
- K.Input(batch_shape=(batch, time, 1)),
+ K.Input(batch_shape=(batch, time, feature_size)),
K.layers.GRU(
- time,
- recurrent_activation="sigmoid",
+ unit,
activation="tanh",
+ recurrent_activation="sigmoid",
+ bias_initializer="GlorotUniform",
return_sequences=return_sequences,
+ reset_after=reset_after,
),
K.layers.Dense(1),
],
optimizer=opt.SGD(learning_rate=0.1),
iteration=10,
- input_shape=(batch, time, 1),
+ input_shape=(batch, time, feature_size),
label_shape=(batch, time, 1),
is_onehot=False,
loss_fn_str="mse"
)
- gru_layer_tc(1, 1, False)(file_name="gru_basic.info")
- gru_layer_tc(1, 2, True)(file_name="gru_return_sequence.info")
- gru_layer_tc(2, 2, True)(file_name="gru_return_sequence_with_batch.info")
+ gru_layer_tc(1, 1, 3, 4, False, False)(file_name="gru_basic.info")
+ gru_layer_tc(1, 2, 3, 4, True, False)(file_name="gru_return_sequence.info")
+ gru_layer_tc(2, 2, 3, 4, True, False)(file_name="gru_return_sequence_with_batch.info")
+ # Check reset_after
+ gru_layer_tc(1, 1, 3, 4, False, True)(file_name="gru_reset_after_basic.info")
+ gru_layer_tc(1, 2, 3, 4, True, True)(file_name="gru_reset_after_return_sequence.info")
+ gru_layer_tc(2, 2, 3, 4, True, True)(file_name="gru_reset_after_return_sequence_with_batch.info")
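+
+ # Illustration only, never called by record: a minimal NumPy sketch of
+ # where reset_after changes the GRU step. gru_step and the W/U/b_ih/b_hh
+ # slice dicts are assumptions local to this sketch, not Keras or nntrainer
+ # API; pass zeros for every b_hh slice to model reset_after=False, where
+ # the biases are integrated into one.
+ def gru_step(x, h, W, U, b_ih, b_hh, reset_after):
+     import numpy as np
+
+     def sigmoid(a):
+         return 1.0 / (1.0 + np.exp(-a))
+
+     # z: update gate, r: reset gate, g: candidate hidden state
+     z = sigmoid(x @ W["z"] + h @ U["z"] + b_ih["z"] + b_hh["z"])
+     r = sigmoid(x @ W["r"] + h @ U["r"] + b_ih["r"] + b_hh["r"])
+     if reset_after:
+         # reset gate applied after the recurrent matmul, so the recurrent
+         # bias must stay separate (hence the new bias_ih/bias_hh pair)
+         g = np.tanh(x @ W["g"] + b_ih["g"] + r * (h @ U["g"] + b_hh["g"]))
+     else:
+         # reset gate applied to the hidden state before the matmul
+         g = np.tanh(x @ W["g"] + b_ih["g"] + (r * h) @ U["g"])
+     return z * h + (1.0 - z) * g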
- multi_gru_layer_tc = lambda batch, time: partial(
+ multi_gru_layer_tc = lambda batch, time, unit, feature_size, reset_after: partial(
record,
model=[
- K.Input(batch_shape=(batch, time, 1)),
+ K.Input(batch_shape=(batch, time, feature_size)),
K.layers.GRU(
- time,
- recurrent_activation="sigmoid",
+ unit,
activation="tanh",
+ recurrent_activation="sigmoid",
+ bias_initializer="GlorotUniform",
return_sequences=True,
+ reset_after=reset_after,
),
- K.layers.GRU(time, recurrent_activation="sigmoid", activation="tanh"),
+ K.layers.GRU(
+ unit,
+ activation="tanh",
+ recurrent_activation="sigmoid",
+ bias_initializer="GlorotUniform",
+ reset_after=reset_after,
+ ),
K.layers.Dense(1),
],
optimizer=opt.SGD(learning_rate=0.1),
iteration=10,
- input_shape=(batch, time, 1),
+ input_shape=(batch, time, feature_size),
label_shape=(batch, 1),
is_onehot=False,
loss_fn_str="mse",
)
- multi_gru_layer_tc(1,2)(file_name="multi_gru_return_sequence.info")
- multi_gru_layer_tc(2,2)(file_name="multi_gru_return_sequence_with_batch.info")
-
- def multiout_test():
- # x -> [a, b] -> c
- x = K.Input(shape=(1, 10), name="x")
- fc = K.layers.Dense(2, name="fc")(x)
- b0, a0 = MultiOutLayer(num_output=2)(fc)
- fc1 = K.layers.Dense(2, name="fc1")(a0)
- fc2 = K.layers.Dense(2, name="fc2")(b0)
- add1 = K.layers.Add(name="add_1")([fc1, fc2]) # [a, b] -> c
- fc3 = K.layers.Dense(3, name="fc3")(add1)
- sm = K.layers.Activation("softmax", name="sm")(fc3)
-
- return x, [x, fc, b0, a0, fc1, fc2, add1, fc3, sm]
-
- x, y = multiout_test()
- record(
- loss_fn_str="mse",
- file_name="multiout_model.info",
- input_shape=(3, 10),
- label_shape=(3, 3),
- optimizer=opt.SGD(learning_rate=0.1),
- iteration=10,
- inputs=x,
- outputs=y,
- # debug=["name", "summary", "output", "initial_weights"],
- )
+ multi_gru_layer_tc(1, 2, 3, 4, False)(file_name="multi_gru_return_sequence.info")
+ multi_gru_layer_tc(2, 2, 3, 4, False)(file_name="multi_gru_return_sequence_with_batch.info")
+ # Check reset_after
+ multi_gru_layer_tc(1, 2, 3, 4, True)(file_name="multi_gru_reset_after_return_sequence.info")
+ multi_gru_layer_tc(2, 2, 3, 4, True)(file_name="multi_gru_reset_after_return_sequence_with_batch.info")
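+
+ # Quick sanity sketch (illustrative, not executed by this script):
+ # reset_after=True adds a second bias row of size 3 * unit, so a unit=3
+ # GRU gains 9 parameters regardless of feature_size.
+ # gru_ra = K.layers.GRU(3, reset_after=True)
+ # gru_ra.build((None, 2, 4))
+ # gru_no = K.layers.GRU(3, reset_after=False)
+ # gru_no.build((None, 2, 4))
+ # assert gru_ra.count_params() - gru_no.count_params() == 3 * 3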
+
+ # def multiout_test():
+ # # x -> [a, b] -> c
+ # x = K.Input(shape=(1, 10), name="x")
+ # fc = K.layers.Dense(2, name="fc")(x)
+ # b0, a0 = MultiOutLayer(num_output=2)(fc)
+ # fc1 = K.layers.Dense(2, name="fc1")(a0)
+ # fc2 = K.layers.Dense(2, name="fc2")(b0)
+ # add1 = K.layers.Add(name="add_1")([fc1, fc2]) # [a, b] -> c
+ # fc3 = K.layers.Dense(3, name="fc3")(add1)
+ # sm = K.layers.Activation("softmax", name="sm")(fc3)
+
+ # return x, [x, fc, b0, a0, fc1, fc2, add1, fc3, sm]
+
+ # x, y = multiout_test()
+ # record(
+ # loss_fn_str="mse",
+ # file_name="multiout_model.info",
+ # input_shape=(3, 10),
+ # label_shape=(3, 3),
+ # optimizer=opt.SGD(learning_rate=0.1),
+ # iteration=10,
+ # inputs=x,
+ # outputs=y,
+ # # debug=["name", "summary", "output", "initial_weights"],
+ # )
return [layer(tf_output) for layer in self.stub_layers]
+##
+# @brief Translayer for GRU layer: splits the fused bias into bias_ih and
+# bias_hh when reset_after is enabled
+class GRUTransLayer(IdentityTransLayer):
+ def to_nntr_weights(self, tensorOrList):
+ # With reset_after=True, Keras fuses the two GRU biases into one rank-2
+ # tensor of shape (2, 3 * units): row 0 is the input bias and row 1 the
+ # recurrent bias. Split them so the result matches nntrainer's
+ # weight_ih, weight_hh, bias_ih, bias_hh order.
+ bias = tensorOrList[2]
+ if bias.shape.rank == 2:
+ bias_ih, bias_hh = bias[0], bias[1]
+ return [tensorOrList[0], tensorOrList[1], bias_ih, bias_hh]
+ else:
+ # reset_after=False keeps a single rank-1 bias; nothing to translate.
+ return tensorOrList
+
+ def to_nntr_trainable_weights(self, tensorOrList):
+ return self.to_nntr_weights(tensorOrList)
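+
+# Rough usage sketch for the translation above (assumes tf.keras with
+# unit=3, feature_size=4; not part of the recorder flow):
+#
+#   gru = K.layers.GRU(3, reset_after=True)
+#   gru.build((None, 2, 4))
+#   [w.shape for w in gru.weights]
+#   # -> [(4, 9), (3, 9), (2, 9)]: kernel, recurrent kernel, fused bias
+#   [w.shape for w in GRUTransLayer(gru).to_nntr_weights(gru.weights)]
+#   # -> [(4, 9), (3, 9), (9,), (9,)]: weight_ih, weight_hh, bias_ih, bias_hh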
##
# @brief A factory function to attach translayer to existing layer
if isinstance(layer, CHANNEL_LAST_LAYERS):
return ChannelLastTransLayer(layer)
+ if isinstance(layer, K.layers.GRU):
+ return GRUTransLayer(layer)
+
return layer
#include <gru.h>
#include <layers_common_tests.h>
-auto semantic_gru =
- LayerSemanticsParamType(nntrainer::createLayer<nntrainer::GRULayer>,
- nntrainer::GRULayer::type, {"unit=1"}, 0, false, 1);
+auto semantic_gru = LayerSemanticsParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>, nntrainer::GRULayer::type,
+ {"unit=1", "integrate_bias=true", "reset_after=false"}, 0, false, 1);
INSTANTIATE_TEST_CASE_P(GRU, LayerSemantics, ::testing::Values(semantic_gru));
auto gru_single_step = LayerGoldenTestParamType(
- nntrainer::createLayer<nntrainer::GRULayer>, {"unit=5"}, "3:1:1:7",
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "integrate_bias=true", "reset_after=false"}, "3:1:1:7",
"gru_single_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
auto gru_multi_step = LayerGoldenTestParamType(
- nntrainer::createLayer<nntrainer::GRULayer>, {"unit=5"}, "3:1:4:7",
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "integrate_bias=true", "reset_after=false"}, "3:1:4:7",
"gru_multi_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
-auto gru_single_step_seq = LayerGoldenTestParamType(
- nntrainer::createLayer<nntrainer::GRULayer>,
- {"unit=5", "return_sequences=true"}, "3:1:1:7",
- "gru_single_step_seq.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto gru_single_step_seq =
+ LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true",
+ "integrate_bias=true", "reset_after=false"},
+ "3:1:1:7", "gru_single_step_seq.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
-auto gru_multi_step_seq = LayerGoldenTestParamType(
- nntrainer::createLayer<nntrainer::GRULayer>,
- {"unit=5", "return_sequences=true"}, "3:1:4:7",
- "gru_multi_step_seq.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto gru_multi_step_seq =
+ LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true",
+ "integrate_bias=true", "reset_after=false"},
+ "3:1:4:7", "gru_multi_step_seq.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
auto gru_multi_step_seq_act_orig = LayerGoldenTestParamType(
nntrainer::createLayer<nntrainer::GRULayer>,
{"unit=5", "return_sequences=true", "hidden_state_activation=tanh",
- "recurrent_activation=sigmoid"},
+ "recurrent_activation=sigmoid", "integrate_bias=true", "reset_after=false"},
"3:1:4:7", "gru_multi_step_seq.nnlayergolden",
LayerGoldenTestParamOptions::DEFAULT);
auto gru_multi_step_seq_act = LayerGoldenTestParamType(
nntrainer::createLayer<nntrainer::GRULayer>,
{"unit=5", "return_sequences=true", "hidden_state_activation=sigmoid",
- "recurrent_activation=tanh"},
+ "recurrent_activation=tanh", "integrate_bias=true", "reset_after=false"},
"3:1:4:7", "gru_multi_step_seq_act.nnlayergolden",
LayerGoldenTestParamOptions::DEFAULT);
-INSTANTIATE_TEST_CASE_P(GRU, LayerGoldenTest,
- ::testing::Values(gru_single_step, gru_multi_step,
- gru_single_step_seq,
- gru_multi_step_seq,
- gru_multi_step_seq_act_orig,
- gru_multi_step_seq_act));
+// Check reset_after
+auto gru_reset_after_single_step = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "integrate_bias=false", "reset_after=true"}, "3:1:1:7",
+ "gru_reset_after_single_step.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "integrate_bias=false", "reset_after=true"}, "3:1:4:7",
+ "gru_reset_after_multi_step.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_single_step_seq = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true", "integrate_bias=false",
+ "reset_after=true"},
+ "3:1:1:7", "gru_reset_after_single_step_seq.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true", "integrate_bias=false",
+ "reset_after=true"},
+ "3:1:4:7", "gru_reset_after_multi_step_seq.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq_act_orig = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true", "hidden_state_activation=tanh",
+ "recurrent_activation=sigmoid", "integrate_bias=false", "reset_after=true"},
+ "3:1:4:7", "gru_reset_after_multi_step_seq.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto gru_reset_after_multi_step_seq_act = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::GRULayer>,
+ {"unit=5", "return_sequences=true", "hidden_state_activation=sigmoid",
+ "recurrent_activation=tanh", "integrate_bias=false", "reset_after=true"},
+ "3:1:4:7", "gru_reset_after_multi_step_seq_act.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+INSTANTIATE_TEST_CASE_P(
+ GRU, LayerGoldenTest,
+ ::testing::Values(gru_single_step, gru_multi_step, gru_single_step_seq,
+ gru_multi_step_seq, gru_multi_step_seq_act_orig,
+ gru_multi_step_seq_act, gru_reset_after_single_step,
+ gru_reset_after_multi_step, gru_reset_after_single_step_seq,
+ gru_reset_after_multi_step_seq,
+ gru_reset_after_multi_step_seq_act_orig,
+ gru_reset_after_multi_step_seq_act));
#include <grucell.h>
#include <layers_common_tests.h>
-auto semantic_grucell = LayerSemanticsParamType(
- nntrainer::createLayer<nntrainer::GRUCellLayer>,
- nntrainer::GRUCellLayer::type,
- {"unit=1", "max_timestep=1", "timestep=0", "integrate_bias=true"}, 0, false,
- 1);
+auto semantic_grucell =
+ LayerSemanticsParamType(nntrainer::createLayer<nntrainer::GRUCellLayer>,
+ nntrainer::GRUCellLayer::type,
+ {"unit=1", "max_timestep=1", "timestep=0",
+ "integrate_bias=false", "reset_after=true"},
+ 0, false, 1);
INSTANTIATE_TEST_CASE_P(GRUCell, LayerSemantics,
::testing::Values(semantic_grucell));
-auto grucell_single_step = LayerGoldenTestParamType(
- nntrainer::createLayer<nntrainer::GRUCellLayer>,
- {"unit=5", "max_timestep=1", "timestep=0", "integrate_bias=true"}, "3:1:1:7",
- "gru_single_step.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+auto grucell_single_step =
+ LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::GRUCellLayer>,
+ {"unit=5", "max_timestep=1", "timestep=0",
+ "integrate_bias=true", "reset_after=false"},
+ "3:1:1:7", "gru_single_step.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
INSTANTIATE_TEST_CASE_P(GRUCell, LayerGoldenTest,
::testing::Values(grucell_single_step));
}
auto grucell = makeGraph({
- {"grucell", {"name=a1", "unit=2", "integrate_bias=false"}},
+ {"grucell",
+ {"name=a1", "unit=2", "integrate_bias=false", "reset_after=true"}},
});
nn->addWithReferenceLayers(grucell, "grucell_scope", {"input"}, {"a1"},
}
auto grucell = makeGraph({
- {"grucell", {"name=a1", "unit=2", "integrate_bias=false"}},
{"grucell",
- {"name=a2", "unit=2", "integrate_bias=false", "input_layers=a1"}},
+ {"name=a1", "unit=2", "integrate_bias=false", "reset_after=true"}},
+ {"grucell",
+ {"name=a2", "unit=2", "integrate_bias=false", "reset_after=true",
+ "input_layers=a1"}},
});
nn->addWithReferenceLayers(grucell, "grucell_scope", {"input"}, {"a1"},
{
nn_base + "loss=mse | batch_size=1",
sgd_base + "learning_rate = 0.1",
- I("input") + input_base + "input_shape=1:1:1",
+ I("input") + input_base + "input_shape=1:1:4",
I("gru") + gru_base +
- "unit = 1" + "input_layers=input",
+ "unit = 3" + "input_layers=input" + "integrate_bias=true" + "reset_after=false",
I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
}
);
{
nn_base + "loss=mse | batch_size=1",
sgd_base + "learning_rate = 0.1",
- I("input") + input_base + "input_shape=1:2:1",
+ I("input") + input_base + "input_shape=1:2:4",
I("gru") + gru_base +
- "unit = 2" + "input_layers=input"+ "return_sequences=true",
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
}
);
{
nn_base + "loss=mse | batch_size=2",
sgd_base + "learning_rate = 0.1",
- I("input") + input_base + "input_shape=1:2:1",
+ I("input") + input_base + "input_shape=1:2:4",
I("gru") + gru_base +
- "unit = 2" + "input_layers=input"+ "return_sequences=true",
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
}
);
{
nn_base + "loss=mse | batch_size=1",
sgd_base + "learning_rate = 0.1",
- I("input") + input_base + "input_shape=1:2:1",
+ I("input") + input_base + "input_shape=1:2:4",
I("gru") + gru_base +
- "unit = 2" + "input_layers=input"+ "return_sequences=true",
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
I("gru2") + gru_base +
- "unit = 2" + "input_layers=gru",
+ "unit = 3" + "input_layers=gru" + "integrate_bias=true" + "reset_after=false",
I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
}
);
{
nn_base + "loss=mse | batch_size=2",
sgd_base + "learning_rate = 0.1",
- I("input") + input_base + "input_shape=1:2:1",
+ I("input") + input_base + "input_shape=1:2:4",
+ I("gru") + gru_base +
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=true" + "reset_after=false",
+ I("gru2") + gru_base +
+ "unit = 3" + "input_layers=gru" + "integrate_bias=true" + "reset_after=false",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
+ }
+);
+
+// Check reset_after
+INI gru_reset_after_basic(
+ "gru_reset_after_basic",
+ {
+ nn_base + "loss=mse | batch_size=1",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:1:4",
+ I("gru") + gru_base +
+ "unit = 3" + "input_layers=input" + "integrate_bias=false" + "reset_after=true",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+ }
+);
+
+INI gru_reset_after_return_sequence(
+ "gru_reset_after_return_sequence",
+ {
+ nn_base + "loss=mse | batch_size=1",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:2:4",
+ I("gru") + gru_base +
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+ }
+);
+
+INI gru_reset_after_return_sequence_with_batch(
+ "gru_reset_after_return_sequence_with_batch",
+ {
+ nn_base + "loss=mse | batch_size=2",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:2:4",
+ I("gru") + gru_base +
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru"
+ }
+);
+
+INI multi_gru_reset_after_return_sequence(
+ "multi_gru_reset_after_return_sequence",
+ {
+ nn_base + "loss=mse | batch_size=1",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:2:4",
+ I("gru") + gru_base +
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
+ I("gru2") + gru_base +
+ "unit = 3" + "input_layers=gru" + "integrate_bias=false" + "reset_after=true",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
+ }
+);
+
+INI multi_gru_reset_after_return_sequence_with_batch(
+ "multi_gru_reset_after_return_sequence_with_batch",
+ {
+ nn_base + "loss=mse | batch_size=2",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:2:4",
I("gru") + gru_base +
- "unit = 2" + "input_layers=input"+ "return_sequences=true",
+ "unit = 3" + "input_layers=input"+ "return_sequences=true" + "integrate_bias=false" + "reset_after=true",
I("gru2") + gru_base +
- "unit = 2" + "input_layers=gru",
+ "unit = 3" + "input_layers=gru" + "integrate_bias=false" + "reset_after=true",
I("outputlayer") + fc_base + "unit = 1" + "input_layers=gru2"
}
);
mkModelIniTc(gru_return_sequence_with_batch, "2:1:2:1", 10, ModelTestOption::ALL),
mkModelIniTc(multi_gru_return_sequence, "1:1:1:1", 10, ModelTestOption::ALL),
mkModelIniTc(multi_gru_return_sequence_with_batch, "2:1:1:1", 10, ModelTestOption::ALL),
+ mkModelIniTc(gru_reset_after_basic, "1:1:1:1", 10, ModelTestOption::ALL),
+ mkModelIniTc(gru_reset_after_return_sequence, "1:1:2:1", 10, ModelTestOption::ALL),
+ mkModelIniTc(gru_reset_after_return_sequence_with_batch, "2:1:2:1", 10, ModelTestOption::ALL),
+ mkModelIniTc(multi_gru_reset_after_return_sequence, "1:1:1:1", 10, ModelTestOption::ALL),
+ mkModelIniTc(multi_gru_reset_after_return_sequence_with_batch, "2:1:1:1", 10, ModelTestOption::ALL),
/**< multi output test */
mkModelIniTc(multiple_output_model, "3:1:1:10", 10, ModelTestOption::COMPARE) // Todo: Enable option to ALL