From 8498a8bc7831198b13faf05f42dcc8eae18c91fc Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Tue, 18 May 2021 10:25:52 +0900
Subject: [PATCH] [ LSTM ] Add return_sequence

This commit includes,
 . implementation of return_sequence. If it is true, the output of every
   time iteration is generated; if it is false, only the last output is
   available.
 . add the 'return_sequences' keyword

Resolves:

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
 api/ccapi/include/layer.h         |   1 +
 nntrainer/layers/layer_internal.h |   4 +-
 nntrainer/layers/lstm.cpp         | 118 +++++++++++++++++++++++++++++---------
 nntrainer/layers/lstm.h           |  34 ++++++-----
 nntrainer/utils/parse_util.cpp    |   4 +-
 5 files changed, 118 insertions(+), 43 deletions(-)

diff --git a/api/ccapi/include/layer.h b/api/ccapi/include/layer.h
index 9f9f963..c85f33c 100644
--- a/api/ccapi/include/layer.h
+++ b/api/ccapi/include/layer.h
@@ -111,6 +111,7 @@ public:
    *   - out_dim : int ( output dimesion for embedding layer )
    *   - in_length : int ( input length for embedding layer )
    *   - recurrent_activation : string (type) - used only in lstm
+   *   - return_sequences : bool (type) - used only in lstm
    *   - distribute : bool
    */
   /**
diff --git a/nntrainer/layers/layer_internal.h b/nntrainer/layers/layer_internal.h
index 9133851..eb6d0a8 100644
--- a/nntrainer/layers/layer_internal.h
+++ b/nntrainer/layers/layer_internal.h
@@ -234,7 +234,8 @@ public:
    *            33. in_length : int ( input length for embedding layer )
    *            34. recurrent_activation : string (type) - lstm
    *            35. distribute : bool
-   *            36.
+   *            36. split_dimension : string (type)
+   *            37. return_sequences : bool (type) - lstm
    */
   enum class PropertyType {
     input_shape = 0,
@@ -274,6 +275,7 @@ public:
     recurrent_activation = 34,
     distribute = 35,
     split_dimension = 36,
+    return_sequences = 37,
     unknown
   };
diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp
index 13d8ec7..a9e8d24 100644
--- a/nntrainer/layers/lstm.cpp
+++ b/nntrainer/layers/lstm.cpp
@@ -41,10 +41,17 @@ int LSTMLayer::initialize(Manager &manager) {
   }
 
   // input_dim = [ batch, 1, time_iteration, feature_size ]
-  // outut_dim = [ batch, 1, time_iteration, hidden_size ( unit ) ]
+  // if return_sequences == False :
+  //      output_dim = [ batch, 1, 1, hidden_size (unit)]
+  // else:
+  //      output_dim = [ batch, 1, time_iteration, hidden_size ( unit ) ]
   output_dim[0] = input_dim[0];
   output_dim[0].width(unit);
 
+  if (!return_sequences) {
+    output_dim[0].height(1);
+  }
+
   TensorDim bias_dim = TensorDim();
   bias_dim.setTensorDim(3, unit * NUM_GATE);
@@ -81,14 +88,20 @@ int LSTMLayer::initialize(Manager &manager) {
       WeightRegularizer::NONE, 1.0f, true);
   }
 
-  mem_cell =
-    std::make_shared<Var_Grad>(output_dim[0], true, true, "LSTM:mem_cell");
+  TensorDim d = input_dim[0];
+  d.width(unit);
+
+  mem_cell = std::make_shared<Var_Grad>(d, true, true, "LSTM:mem_cell");
   mem_cell->getVariableRef().setZero();
   mem_cell->getGradientRef().setZero();
 
-  TensorDim d = input_dim[0];
-  d.width(unit * NUM_GATE);
+  if (!return_sequences) {
+    hidden = std::make_shared<Var_Grad>(d, true, true, "LSTM:temp_hidden");
+    hidden->getVariableRef().setZero();
+    hidden->getGradientRef().setZero();
+  }
 
+  d.width(unit * NUM_GATE);
   fgio = std::make_shared<Var_Grad>(d, true, true, "LSTM:fgio");
   fgio->getVariableRef().setZero();
   fgio->getGradientRef().setZero();
@@ -103,6 +116,16 @@ int LSTMLayer::initialize(Manager &manager) {
   c_prev = Tensor(cell_dim);
   c_prev.setZero();
 
+  if (Layer::activation_type == ActivationType::ACT_NONE) {
+    Layer::activation_type = ActivationType::ACT_TANH;
+    acti_func.setActiFunc(activation_type);
+  }
+
+  if (recurrent_activation_type == ActivationType::ACT_NONE) {
+    recurrent_activation_type = ActivationType::ACT_SIGMOID;
+    recurrent_acti_func.setActiFunc(recurrent_activation_type);
+  }
+
   return status;
 }
@@ -131,6 +154,12 @@ void LSTMLayer::setProperty(const PropertyType type, const std::string &value) {
       recurrent_acti_func.setActiFunc(acti_type);
     } break;
+  case PropertyType::return_sequences:
+    if (!value.empty()) {
+      status = setBoolean(return_sequences, value);
+      throw_status(status);
+    }
+    break;
   default:
     Layer::setProperty(type, value);
     break;
   }
 }
@@ -153,7 +182,13 @@ void LSTMLayer::forwarding(bool training) {
   Tensor &bias_h =
     weightAt(static_cast<unsigned int>(LSTMParams::bias_h)).getVariableRef();
 
-  Tensor &hidden_ = net_hidden[0]->getVariableRef();
+  Tensor hidden_;
+  if (!return_sequences) {
+    hidden_ = hidden->getVariableRef();
+  } else {
+    hidden_ = net_hidden[0]->getVariableRef();
+  }
+
   Tensor &input_ = net_input[0]->getVariableRef();
   Tensor &m_cell_ = mem_cell->getVariableRef();
@@ -189,20 +224,20 @@ void LSTMLayer::forwarding(bool training) {
       fgio_t.add_i(bias_h);
       fgio_t.add_i(xs.dot(weight_xh));
 
-      Tensor hf = fgio_t.getSharedDataTensor({unit}, 0);
-      Tensor hg = fgio_t.getSharedDataTensor({unit}, unit);
-      Tensor hi = fgio_t.getSharedDataTensor({unit}, unit * 2);
+      Tensor hi = fgio_t.getSharedDataTensor({unit}, 0);
+      Tensor hf = fgio_t.getSharedDataTensor({unit}, unit);
+      Tensor hg = fgio_t.getSharedDataTensor({unit}, unit * 2);
       Tensor ho = fgio_t.getSharedDataTensor({unit}, unit * 3);
 
-      acti_func.run_fn(hf, hf);
-      acti_func.run_fn(hi, hi);
-      acti_func.run_fn(ho, ho);
-      recurrent_acti_func.run_fn(hg, hg);
+      recurrent_acti_func.run_fn(hf, hf);
+      recurrent_acti_func.run_fn(hi, hi);
+      recurrent_acti_func.run_fn(ho, ho);
+      acti_func.run_fn(hg, hg);
 
       hf.multiply(cs_prev, cs);
       cs.add_i(hg.multiply(hi));
-      recurrent_acti_func.run_fn(cs, hs);
+      acti_func.run_fn(cs, hs);
       hs.multiply_i(ho);
     }
     // size of h_prev and hs size is same : unit.
@@ -210,6 +245,16 @@
     h_prev.getBatchSlice(b, 1).copy(hs);
     c_prev.getBatchSlice(b, 1).copy(cs);
   }
+
+  if (!return_sequences) {
+    TensorDim d = hidden_.getDim();
+    for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
+      float *data = hidden_.getAddress(b * d.width() * d.height() +
+                                       (d.height() - 1) * d.width());
+      float *rdata = net_hidden[0]->getVariableRef().getAddress(b * d.width());
+      std::copy(data, data + d.width(), rdata);
+    }
+  }
 }
 
 void LSTMLayer::copy(std::shared_ptr<Layer> l) {
@@ -240,8 +285,25 @@ void LSTMLayer::calcGradient() {
   Tensor &weight_hh =
     weightAt(static_cast<unsigned int>(LSTMParams::weight_hh)).getVariableRef();
 
-  Tensor &derivative_ = net_hidden[0]->getGradientRef();
-  Tensor &hidden_ = net_hidden[0]->getVariableRef();
+  Tensor derivative_;
+  Tensor hidden_;
+
+  if (!return_sequences) {
+    derivative_ = hidden->getGradientRef();
+    TensorDim d = derivative_.getDim();
+    for (unsigned int b = 0; b < input_dim[0].batch(); ++b) {
+      float *data = derivative_.getAddress(b * d.width() * d.height() +
+                                           (d.height() - 1) * d.width());
+      float *rdata = net_hidden[0]->getGradientRef().getAddress(b * d.width());
+      std::copy(rdata, rdata + d.width(), data);
+    }
+
+    hidden_ = hidden->getVariableRef();
+  } else {
+    derivative_ = net_hidden[0]->getGradientRef();
+    hidden_ = net_hidden[0]->getVariableRef();
+  }
+
   Tensor &input_ = net_input[0]->getVariableRef();
   Tensor &m_cell_ = mem_cell->getVariableRef();
   Tensor &dm_cell_ = mem_cell->getGradientRef();
@@ -298,19 +360,19 @@
         dh.add_i(dh_nx);
       }
 
-      Tensor dhf = dfgio_t.getSharedDataTensor({unit}, 0);
-      Tensor dhg = dfgio_t.getSharedDataTensor({unit}, unit);
-      Tensor dhi = dfgio_t.getSharedDataTensor({unit}, unit * 2);
+      Tensor dhi = dfgio_t.getSharedDataTensor({unit}, 0);
+      Tensor dhf = dfgio_t.getSharedDataTensor({unit}, unit);
+      Tensor dhg = dfgio_t.getSharedDataTensor({unit}, unit * 2);
       Tensor dho = dfgio_t.getSharedDataTensor({unit}, unit * 3);
 
-      Tensor hf = fgio_t.getSharedDataTensor({unit}, 0);
-      Tensor hg = fgio_t.getSharedDataTensor({unit}, unit);
-      Tensor hi = fgio_t.getSharedDataTensor({unit}, unit * 2);
+      Tensor hi = fgio_t.getSharedDataTensor({unit}, 0);
+      Tensor hf = fgio_t.getSharedDataTensor({unit}, unit);
+      Tensor hg = fgio_t.getSharedDataTensor({unit}, unit * 2);
       Tensor ho = fgio_t.getSharedDataTensor({unit}, unit * 3);
 
-      recurrent_acti_func.run_fn(cs, dho);
+      acti_func.run_fn(cs, dho);
       dho.multiply_i(dh);
-      recurrent_acti_func.run_prime_fn(cs, dc, ho);
+      acti_func.run_prime_fn(cs, dc, ho);
       dc.multiply_i(dh);
       dc.add_i(dc_nx);
@@ -319,10 +381,10 @@
       dc.multiply(hi, dhg);
       dc.multiply(hf, dc_nx);
 
-      acti_func.run_prime_fn(ho, dho, dho);
-      acti_func.run_prime_fn(hf, dhf, dhf);
-      acti_func.run_prime_fn(hi, dhi, dhi);
-      recurrent_acti_func.run_prime_fn(hg, dhg, dhg);
+      recurrent_acti_func.run_prime_fn(ho, dho, dho);
+      recurrent_acti_func.run_prime_fn(hf, dhf, dhf);
+      recurrent_acti_func.run_prime_fn(hi, dhi, dhi);
+      acti_func.run_prime_fn(hg, dhg, dhg);
 
       djdb_h.add_i(dfgio_t);
diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h
index 68ee50e..9a38bdb 100644
--- a/nntrainer/layers/lstm.h
+++ b/nntrainer/layers/lstm.h
@@ -32,20 +32,17 @@ public:
   template <typename... Args>
   LSTMLayer(
     unsigned int unit_ = 0,
-    ActivationType recurrent_activation_type_ = ActivationType::ACT_SIGMOID,
-    Args... args) :
+    ActivationType recurrent_activation_type_ = ActivationType::ACT_NONE,
+    bool sequence = false, Args... args) :
     Layer(args...),
-    unit(unit_) {
-    /* Default Activation Type is tanh */
-    if (getActivationType() == ActivationType::ACT_NONE)
-      setActivation(ActivationType::ACT_TANH);
-    setRecurrentActivation(recurrent_activation_type_);
-  }
+    unit(unit_),
+    recurrent_activation_type(recurrent_activation_type_),
+    return_sequences(sequence){};
 
   /**
    * @brief Destructor of LSTMLayer
    */
-  ~LSTMLayer(){};
+  ~LSTMLayer() = default;
 
   /**
    * @brief Move constructor.
@@ -125,17 +122,17 @@ private:
   unsigned int unit;
 
   /**
-   * @brief activation function for h_t : default is tanh
+   * @brief activation function for h_t : default is sigmoid
    */
   ActiFunc acti_func;
 
   /**
-   * @brief activation type for recurrent : default is sigmoid
+   * @brief activation type for recurrent : default is tanh
    */
   ActivationType recurrent_activation_type;
 
   /**
-   * @brief activation function for recurrent : default is sigmoid
+   * @brief activation function for recurrent : default is tanh
    */
   ActiFunc recurrent_acti_func;
@@ -153,11 +150,22 @@ private:
    * @brief To save cell data
    */
   std::shared_ptr<Var_Grad> mem_cell;
-
+
   /**
    * @brief To save intermediate gates
    */
   std::shared_ptr<Var_Grad> fgio;
+
+  /**
+   * @brief hidden state
+   */
+  std::shared_ptr<Var_Grad> hidden;
+
+  /**
+   * @brief variable to set return sequences
+   */
+  bool return_sequences;
+
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/utils/parse_util.cpp b/nntrainer/utils/parse_util.cpp
index fe622a5..23c681e 100644
--- a/nntrainer/utils/parse_util.cpp
+++ b/nntrainer/utils/parse_util.cpp
@@ -256,6 +256,7 @@ unsigned int parseType(std::string ll, InputType t) {
  *            recurrent_activation = 34
  *            distribute = 35
  *            split_dimension = 36
+ *            return_sequences = 37
  *
  * InputLayer has 0, 1, 2, 3 properties.
  * FullyConnectedLayer has 1, 4, 6, 7, 8, 9 properties.
@@ -263,7 +264,7 @@ unsigned int parseType(std::string ll, InputType t) {
  * Pooling2DLayer has 12, 13, 14, 15 properties.
  * BatchNormalizationLayer has 0, 1, 5, 6, 7 properties.
  */
-static std::array<std::string, 38> property_string = {
+static std::array<std::string, 39> property_string = {
   "input_shape",
   "normalization",
   "standardization",
@@ -301,6 +302,7 @@ static std::array property_string = {
   "recurrent_activation",
   "distribute",
   "split_dimension",
+  "return_sequences",
   "unknown"};
 
 unsigned int parseLayerProperty(std::string property) {
-- 
2.7.4
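
Usage sketch (illustrative, not part of the patch): assuming the string-based
ml::train::createLayer factory from the ccapi and the "lstm" layer type key,
the new property could be set as shown below; everything except the "unit" and
"return_sequences" keys introduced by this patch is an assumption. With an
input of shape [ batch, 1, time_iteration, feature_size ],
return_sequences=true keeps the full output [ batch, 1, time_iteration, unit ],
while the default (false) collapses it to [ batch, 1, 1, unit ].

  #include <layer.h> // ccapi header; assumed to declare ml::train::createLayer

  int main() {
    // Hypothetical construction; property keys follow this patch.
    // return_sequences=true  -> output of every time iteration is kept
    // return_sequences=false -> only the last time iteration is kept (default)
    auto lstm = ml::train::createLayer(
      "lstm", {"unit=32", "return_sequences=true"});
    return lstm == nullptr;
  }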