This PR includes:
- a unit test for the RNN layer
- support for the return-sequences option of the RNN layer (see the sketch below)
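
For context, `return_sequences` controls whether the layer emits the hidden state for every time step or only the last one. A minimal sketch of the expected output shapes, assuming the TensorFlow Keras API that the recorder script in this PR already uses:

```python
import numpy as np
from tensorflow import keras as K

x = np.zeros((1, 2, 1), dtype=np.float32)  # (batch, time steps, features)

# return_sequences=False (default): only the last hidden state -> shape (1, 2)
last_state = K.layers.SimpleRNN(2)(x)

# return_sequences=True: hidden state for every step -> shape (1, 2, 2)
all_states = K.layers.SimpleRNN(2, return_sequences=True)(x)

print(last_state.shape, all_states.shape)
```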
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: jijoong.moon <jijoong.moon@samsung.com>
ActivationType act = current.getActivationType();
- if (current.getType() == RNNLayer::type) {
- // No need to add activation layer for RNN Layer
- // Default activation is tanh
- if (act == ActivationType::ACT_NONE)
- act = ActivationType::ACT_TANH;
- current.setActivation(act);
- return status;
- }
-
if (act == ActivationType::ACT_NONE) {
/// ActivationType::ACT_NONE does not need realization
return ML_ERROR_NONE;
recurrent_acti_func.run_prime_fn(hf, dhf, dhf);
recurrent_acti_func.run_prime_fn(hi, dhi, dhi);
acti_func.run_prime_fn(hg, dhg, dhg);
-
djdb_h.add_i(dfgio_t);
djdw_x.add_i(xs.dot(dfgio_t, true, false));
djdw_h.add_i(hs_prev.dot(dfgio_t, true, false));
output_dim[0] = input_dim[0];
output_dim[0].width(unit);
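+ // Without return_sequences only the last time step is emitted, so the
+ // output height collapses to 1.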
+ if (!return_sequences) {
+ output_dim[0].height(1);
+ }
+
TensorDim bias_dim = TensorDim();
bias_dim.setTensorDim(3, unit);
h_prev = Tensor(bias_dim);
h_prev.setZero();
+ TensorDim d = input_dim[0];
+ d.width(unit);
+
+ // We would not need this if we reused net_hidden[0] directly, but doing so
+ // makes the unit test fail because the data is modified during gradient
+ // calculation.
+ // TODO : control this with something like a #define for tests to save memory
+ hidden = std::make_shared<Var_Grad>(d, true, true, "RNN:temp_hidden");
+ hidden->getVariableRef().setZero();
+ hidden->getGradientRef().setZero();
+
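+ // The RNN layer defaults to tanh when no activation is specified.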
+ if (Layer::activation_type == ActivationType::ACT_NONE) {
+ Layer::activation_type = ActivationType::ACT_TANH;
+ acti_func.setActiFunc(activation_type);
+ }
+
return status;
}
acti_func.setActiFunc(acti_type);
}
break;
+ case PropertyType::return_sequences:
+ if (!value.empty()) {
+ status = setBoolean(return_sequences, value);
+ throw_status(status);
+ }
+ break;
default:
Layer::setProperty(type, value);
break;
Tensor &bias_h =
weightAt(static_cast<int>(RNNParams::bias_h)).getVariableRef();
- Tensor &hidden_ = net_hidden[0]->getVariableRef();
+ Tensor hidden_;
+ hidden_ = hidden->getVariableRef();
+
Tensor &input_ = net_input[0]->getVariableRef();
Tensor temp;
if (!training)
h_prev.getBatchSlice(b, 1).copy(hs);
}
+
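+ // Without return_sequences, copy only the last time step of the hidden state
+ // into the layer output for each batch; otherwise copy the whole sequence.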
+ if (!return_sequences) {
+ TensorDim d = hidden_.getDim();
+ for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
+ float *data = hidden_.getAddress(b * d.width() * d.height() +
+ (d.height() - 1) * d.width());
+ float *rdata = net_hidden[0]->getVariableRef().getAddress(b * d.width());
+ std::copy(data, data + d.width(), rdata);
+ }
+ } else {
+ net_hidden[0]->getVariableRef().copy(hidden_);
+ }
}
void RNNLayer::copy(std::shared_ptr<Layer> l) {
}
void RNNLayer::calcDerivative() {
- Tensor &derivative_ = net_hidden[0]->getGradientRef();
+ Tensor derivative_;
+ derivative_ = hidden->getGradientRef();
+
Tensor &weight =
weightAt(static_cast<int>(RNNParams::weight_xh)).getVariableRef();
Tensor &ret_ = net_input[0]->getGradientRef();
Tensor &weight_hh =
weightAt(static_cast<int>(RNNParams::weight_hh)).getVariableRef();
- Tensor &derivative_ = net_hidden[0]->getGradientRef();
- Tensor &hidden_ = net_hidden[0]->getVariableRef();
+ Tensor derivative_;
+ Tensor hidden_;
+ derivative_ = hidden->getGradientRef();
+
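+ // Without return_sequences, the incoming gradient covers only the last time
+ // step, so write it into the last row of the full derivative tensor;
+ // otherwise copy the whole gradient.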
+ if (!return_sequences) {
+ TensorDim d = derivative_.getDim();
+ for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
+ float *data = derivative_.getAddress(b * d.width() * d.height() +
+ (d.height() - 1) * d.width());
+ float *rdata = net_hidden[0]->getGradientRef().getAddress(b * d.width());
+ std::copy(rdata, rdata + d.width(), data);
+ }
+ } else {
+ derivative_.copy(net_hidden[0]->getGradientRef());
+ }
+
+ hidden_ = hidden->getVariableRef();
+
Tensor &input_ = net_input[0]->getVariableRef();
Tensor dh_nx = Tensor(TensorDim(1, 1, 1, derivative_.width()));
}
acti_func.run_prime_fn(hs, dh, dh);
- dh.multiply_i(hs);
-
- float alpha = 1.0;
-
- if (b != 0) {
- alpha = 0.0;
- }
- djdb_h.add_i(dh, alpha);
- djdw_x.add_i(xs.dot(dh, true, false), alpha);
- djdw_h.add_i(hs_prev.dot(dh, true, false), alpha);
+ djdb_h.add_i(dh);
+ djdw_x.add_i(xs.dot(dh, true, false));
+ djdw_h.add_i(hs_prev.dot(dh, true, false));
dh.dot(weight_hh, dh_nx, false, true, 1.0);
}
}
* @brief Constructor of RNNLayer
*/
template <typename... Args>
- RNNLayer(unsigned int unit_ = 0, Args... args) : Layer(args...), unit(unit_) {
- /* Default Activation Type is tanh */
- if (getActivationType() == ActivationType::ACT_NONE)
- setActivation(ActivationType::ACT_TANH);
- }
+ RNNLayer(unsigned int unit_ = 0, bool sequence = false, Args... args) :
+ Layer(args...),
+ unit(unit_),
+ return_sequences(sequence) {}
/**
* @brief Destructor of RNNLayer
* @brief To save hidden state variable ( batch, 1, 1, unit )
*/
Tensor h_prev;
+
+ /**
+ * @brief option for return sequences
+ */
+ bool return_sequences;
+
+ /**
+ * @brief hidden variable for rnn
+ */
+ std::shared_ptr<Var_Grad> hidden;
};
} // namespace nntrainer
loss_fn_str="mse"
)
multi_lstm_layer_return_sequence_with_batch_n(file_name="multi_lstm_return_sequence_with_batch_n.info", debug=["summary", "initial_weights", "dx", "output", "layer_name", "label","weights","gradients"],)
+
+
+ rnn_layer_tc = lambda rnn_layer: partial(
+ record,
+ model=[
+ K.Input(batch_shape=(1, 1, 1)),
+ rnn_layer,
+ K.layers.Dense(1)
+ ],
+ optimizer=opt.SGD(learning_rate=0.1),
+ iteration=1,
+ input_shape=(1,1,1),
+ label_shape=(1,1),
+ is_onehot=False,
+ loss_fn_str="mse"
+ )
+ rnn = K.layers.SimpleRNN(2)
+ rnn_layer_tc(rnn)(file_name="rnn_basic.info", debug=["summary", "initial_weights", "dx", "output", "layer_name", "label","weights","gradients"],)
+
+ rnn_layer_return_sequence_tc = lambda rnn_layer: partial(
+ record,
+ model=[
+ K.Input(batch_shape=(1, 2, 1)),
+ rnn_layer,
+ K.layers.Dense(1)
+ ],
+ optimizer=opt.SGD(learning_rate=0.1),
+ iteration=1,
+ input_shape=(1,2,1),
+ label_shape=(2,1),
+ is_onehot=False,
+ loss_fn_str="mse"
+ )
+
+ rnn = K.layers.SimpleRNN(2, return_sequences=True)
+ rnn_layer_return_sequence_tc(rnn)(file_name="rnn_return_sequences.info", debug=["summary", "initial_weights", "dx", "output", "layer_name", "label"],)
+
static std::string input_base = "type = input";
static std::string fc_base = "type = Fully_connected";
static std::string conv_base = "type = conv2d | stride = 1,1 | padding = 0,0";
+static std::string rnn_base = "type = rnn";
static std::string lstm_base = "type = lstm";
static std::string pooling_base = "type = pooling2d | padding = 0,0";
static std::string preprocess_flip_base = "type = preprocess_flip";
}
);
+INI rnn_basic(
+ "rnn_basic",
+ {
+ nn_base + "loss=mse | batch_size=1",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:1:1",
+ I("rnn") + rnn_base +
+ "unit = 2" + "input_layers=input",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn"
+ }
+);
+
+INI rnn_return_sequences(
+ "rnn_return_sequences",
+ {
+ nn_base + "loss=mse | batch_size=1",
+ sgd_base + "learning_rate = 0.1",
+ I("input") + input_base + "input_shape=1:2:1",
+ I("rnn") + rnn_base +
+ "unit = 2" + "input_layers=input" + "return_sequences=true",
+ I("outputlayer") + fc_base + "unit = 1" + "input_layers=rnn"
+ }
+);
+
INI multi_lstm_return_sequence(
"multi_lstm_return_sequence",
{
// mkModelTc(fc_softmax_mse_distribute_validate, "3:1:5:3", 1),
// mkModelTc(fc_softmax_cross_distribute_validate, "3:1:5:3", 1),
// mkModelTc(fc_sigmoid_cross_distribute_validate, "3:1:5:3", 1)
+ mkModelTc(rnn_basic, "1:1:1:1", 1),
+ mkModelTc(rnn_return_sequences, "1:1:2:1", 1),
mkModelTc(lstm_basic, "1:1:1:1", 1),
mkModelTc(lstm_return_sequence, "1:1:2:1", 1),
mkModelTc(lstm_return_sequence_with_batch, "2:1:2:1", 1),