From 369eb18771229c118ecb7e71fd82fcfccd5270e3 Mon Sep 17 00:00:00 2001
From: "jijoong.moon" <jijoong.moon@samsung.com>
Date: Tue, 22 Jun 2021 09:53:12 +0900
Subject: [PATCH] [ Recurrent ] Implement Dropout for Recurrent Net

This commit introduces dropout for recurrent networks. A dropout
property is added; if an element of a uniformly random tensor is
smaller than the dropout rate, the corresponding input element is set
to zero. Elements that are not zeroed out are scaled by
1.0/(1.0-dropout), so the expected activation stays the same.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <jijoong.moon@samsung.com>
---
 api/ccapi/include/layer.h         |  1 +
 nntrainer/layers/gru.cpp          | 10 ++++++++++
 nntrainer/layers/gru.h            | 10 ++++++++--
 nntrainer/layers/layer_internal.h |  2 ++
 nntrainer/layers/lstm.cpp         | 11 +++++++++++
 nntrainer/layers/lstm.h           | 10 ++++++++--
 nntrainer/layers/rnn.cpp          | 10 ++++++++++
 nntrainer/layers/rnn.h            | 10 ++++++++--
 nntrainer/tensor/tensor.cpp       | 18 ++++++++++++++++++
 nntrainer/tensor/tensor.h         |  7 +++++++
 nntrainer/utils/parse_util.cpp    |  4 +++-
 11 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/api/ccapi/include/layer.h b/api/ccapi/include/layer.h
index 759ce7a..794269d 100644
--- a/api/ccapi/include/layer.h
+++ b/api/ccapi/include/layer.h
@@ -116,6 +116,7 @@ public:
    * - return_sequences : bool (type) - used only in lstm
    * - distribute : bool
    * - hidden_state_activation : string (type) - used only in lstm
+   * - dropout : float (type) - dropout rate
    */
   /**
    * @brief set Property of layer
diff --git a/nntrainer/layers/gru.cpp b/nntrainer/layers/gru.cpp
index f5cb230..cb8fc85 100644
--- a/nntrainer/layers/gru.cpp
+++ b/nntrainer/layers/gru.cpp
@@ -161,6 +161,12 @@ void GRULayer::setProperty(const PropertyType type, const std::string &value) {
       throw_status(status);
     }
     break;
+  case PropertyType::dropout:
+    if (!value.empty()) {
+      status = setFloat(dropout_rate, value);
+      throw_status(status);
+    }
+    break;
   default:
     LayerV1::setProperty(type, value);
     break;
@@ -207,6 +213,10 @@ void GRULayer::forwarding(bool training) {
   for (unsigned int t = 0; t < islice.height(); ++t) {
     Tensor xs =
       islice.getSharedDataTensor({islice.width()}, t * islice.width());
+
+    if (dropout_rate > 0.0 && training) {
+      xs.multiply_i(xs.dropout_mask(dropout_rate));
+    }
     hs = oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
     Tensor zrg_t =
       zrg_.getSharedDataTensor({unit * NUM_GATE}, unit * t * NUM_GATE);
diff --git a/nntrainer/layers/gru.h b/nntrainer/layers/gru.h
index f920799..d318d29 100644
--- a/nntrainer/layers/gru.h
+++ b/nntrainer/layers/gru.h
@@ -34,12 +34,13 @@ public:
     unsigned int unit_ = 0,
     ActivationType hidden_state_activation_type_ = ActivationType::ACT_NONE,
     ActivationType recurrent_activation_type_ = ActivationType::ACT_NONE,
-    bool sequence = false, Args... args) :
+    bool sequence = false, float dropout = 0.0, Args... args) :
     LayerV1(args...),
     unit(unit_),
     hidden_state_activation_type(hidden_state_activation_type_),
     recurrent_activation_type(recurrent_activation_type_),
-    return_sequences(sequence){};
+    return_sequences(sequence),
+    dropout_rate(dropout){};
 
   /**
    * @brief Destructor of GRULayer
@@ -162,6 +163,11 @@ private:
    * @brief variable to set return sequences
    */
   bool return_sequences;
+
+  /**
+   * @brief dropout rate
+   */
+  float dropout_rate;
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/layers/layer_internal.h b/nntrainer/layers/layer_internal.h
index 3e03848..527005a 100644
--- a/nntrainer/layers/layer_internal.h
+++ b/nntrainer/layers/layer_internal.h
@@ -248,6 +248,7 @@ public:
    * 36. split_dimension : string (type)
    * 37. return_sequences : bool (type) - lstm
    * 39. hidden_state_activation : string (type) - lstm
+   * 40. dropout : float (type) - dropout rate
    */
   enum class PropertyType {
     input_shape = 0,
@@ -289,6 +290,7 @@ public:
     split_dimension = 36,
     return_sequences = 37,
     hidden_state_activation = 38,
+    dropout = 39,
     unknown
   };
 
diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp
index 199ff7a..a45b01e 100644
--- a/nntrainer/layers/lstm.cpp
+++ b/nntrainer/layers/lstm.cpp
@@ -146,6 +146,12 @@ void LSTMLayer::setProperty(const PropertyType type, const std::string &value) {
       throw_status(status);
     }
     break;
+  case PropertyType::dropout:
+    if (!value.empty()) {
+      status = setFloat(dropout_rate, value);
+      throw_status(status);
+    }
+    break;
   default:
     LayerV1::setProperty(type, value);
     break;
@@ -194,6 +200,11 @@ void LSTMLayer::forwarding(bool training) {
   for (unsigned int t = 0; t < islice.height(); ++t) {
     Tensor xs =
       islice.getSharedDataTensor({islice.width()}, t * islice.width());
+
+    if (dropout_rate > 0.0 && training) {
+      xs.multiply_i(xs.dropout_mask(dropout_rate));
+    }
+
     hs = oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
     cs = cell.getSharedDataTensor({cell.width()}, t * cell.width());
     Tensor fgio_t =
diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h
index c88893a..c1f228f 100644
--- a/nntrainer/layers/lstm.h
+++ b/nntrainer/layers/lstm.h
@@ -34,12 +34,13 @@ public:
     unsigned int unit_ = 0,
     ActivationType hidden_state_activation_type_ = ActivationType::ACT_NONE,
     ActivationType recurrent_activation_type_ = ActivationType::ACT_NONE,
-    bool sequence = false, Args... args) :
+    bool sequence = false, float dropout = 0.0, Args... args) :
     LayerV1(args...),
     unit(unit_),
     hidden_state_activation_type(hidden_state_activation_type_),
     recurrent_activation_type(recurrent_activation_type_),
-    return_sequences(sequence){};
+    return_sequences(sequence),
+    dropout_rate(dropout){};
 
   /**
    * @brief Destructor of LSTMLayer
@@ -172,6 +173,11 @@ private:
    * @brief variable to set return sequences
    */
   bool return_sequences;
+
+  /**
+   * @brief dropout rate
+   */
+  float dropout_rate;
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/layers/rnn.cpp b/nntrainer/layers/rnn.cpp
index 0373ca7..941de0a 100644
--- a/nntrainer/layers/rnn.cpp
+++ b/nntrainer/layers/rnn.cpp
@@ -122,6 +122,12 @@ void RNNLayer::setProperty(const PropertyType type, const std::string &value) {
       throw_status(status);
     }
     break;
+  case PropertyType::dropout:
+    if (!value.empty()) {
+      status = setFloat(dropout_rate, value);
+      throw_status(status);
+    }
+    break;
   default:
     LayerV1::setProperty(type, value);
     break;
@@ -160,6 +166,10 @@ void RNNLayer::forwarding(bool training) {
     Tensor xs =
       islice.getSharedDataTensor({islice.width()}, t * islice.width());
 
+    if (dropout_rate > 0.0 && training) {
+      xs.multiply_i(xs.dropout_mask(dropout_rate));
+    }
+
     hs = oslice.getSharedDataTensor({oslice.width()}, t * oslice.width());
     if (t > 0) {
       hs_prev = oslice.getSharedDataTensor({oslice.width()},
diff --git a/nntrainer/layers/rnn.h b/nntrainer/layers/rnn.h
index 28b7a2e..7954d13 100644
--- a/nntrainer/layers/rnn.h
+++ b/nntrainer/layers/rnn.h
@@ -33,11 +33,12 @@ public:
   RNNLayer(
     unsigned int unit_ = 0,
     ActivationType hidden_state_activation_type_ = ActivationType::ACT_NONE,
-    bool sequence = false, Args... args) :
+    bool sequence = false, float dropout = 0.0, Args... args) :
     LayerV1(args...),
     unit(unit_),
     hidden_state_activation_type(hidden_state_activation_type_),
-    return_sequences(sequence){};
+    return_sequences(sequence),
+    dropout_rate(dropout){};
 
   /**
    * @brief Destructor of RNNLayer
@@ -127,6 +128,11 @@ private:
   bool return_sequences;
 
   /**
+   * @brief dropout rate
+   */
+  float dropout_rate;
+
+  /**
    * @brief hidden variable for rnn
    */
   std::shared_ptr hidden;
diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index 6b95376..30f645f 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -870,6 +870,24 @@ Tensor Tensor::transpose(const std::string &direction) const {
   return result;
 }
 
+Tensor Tensor::dropout_mask(float dropout) const {
+  Tensor result(dim);
+  result.setValue(1.0);
+  Tensor rand_temp(dim);
+  rand_temp.setRandUniform(0.0, 1.0);
+  float scale = 1.0 / (1 - dropout);
+
+  float *mask = result.getData();
+  float *random = rand_temp.getData();
+  for (unsigned int i = 0; i < length(); ++i) {
+    if (random[i] >= dropout)
+      mask[i] = mask[i] * scale;
+    else
+      mask[i] = 0.0;
+  }
+  return result;
+}
+
 int Tensor::apply_i(std::function<float(float)> f) {
   float *data = getData();
 
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 5a98da7..84a2338 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -515,6 +515,13 @@ public:
   Tensor &transpose(const std::string &direction, Tensor &out) const;
 
   /**
+   * @brief Calculate dropout mask: each element is 0 or 1.0/(1.0-dropout)
+   * @param dropout dropout rate
+   * @retval Tensor dropout mask
+   */
+  Tensor dropout_mask(float dropout) const;
+
+  /**
    * @brief sum all the Tensor elements according to the batch
    * @retval Calculated Tensor(batch, 1, 1, 1)
    */
diff --git a/nntrainer/utils/parse_util.cpp b/nntrainer/utils/parse_util.cpp
index ee15de0..5c34b68 100644
--- a/nntrainer/utils/parse_util.cpp
+++ b/nntrainer/utils/parse_util.cpp
@@ -258,6 +258,7 @@ unsigned int parseType(std::string ll, InputType t) {
  * split_dimension = 36
  * return_sequences = 37
  * hidden_state_activation = 38
+ * dropout = 39
  *
  * InputLayer has 0, 1, 2, 3 properties.
  * FullyConnectedLayer has 1, 4, 6, 7, 8, 9 properties.
@@ -265,7 +266,7 @@ unsigned int parseType(std::string ll, InputType t) {
  * Pooling2DLayer has 12, 13, 14, 15 properties.
  * BatchNormalizationLayer has 0, 1, 5, 6, 7 properties.
  */
-static std::array<std::string, 40> property_string = {
+static std::array<std::string, 41> property_string = {
   "input_shape",
   "normalization",
   "standardization",
@@ -305,6 +306,7 @@ static std::array<std::string, 40> property_string = {
   "split_dimension",
   "return_sequences",
   "hidden_state_activation",
+  "dropout",
   "unknown"};
 
 unsigned int parseLayerProperty(std::string property) {
-- 
2.7.4
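
For reviewers: a minimal standalone sketch of the inverted-dropout rule that
Tensor::dropout_mask() implements, assuming 0.0 <= rate < 1.0. Each element
is dropped to zero with probability `rate`; survivors are scaled by
1.0/(1.0-rate) so the expected activation is unchanged and no rescaling is
needed at inference time. The function name make_dropout_mask and the use of
std::mt19937 are illustrative only, not nntrainer API.

#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

// Illustrative stand-in for Tensor::dropout_mask(): element i is 0.0 when
// the uniform sample falls below `rate`, otherwise 1/(1 - rate), so that
// E[x * mask] == x. Assumes 0.0 <= rate < 1.0.
std::vector<float> make_dropout_mask(std::size_t n, float rate) {
  std::mt19937 gen(std::random_device{}());
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  const float scale = 1.0f / (1.0f - rate);
  std::vector<float> mask(n);
  for (std::size_t i = 0; i < n; ++i)
    mask[i] = (uniform(gen) >= rate) ? scale : 0.0f;
  return mask;
}

int main() {
  std::vector<float> xs = {1.0f, 2.0f, 3.0f, 4.0f};
  // Training-time step, mirroring xs.multiply_i(xs.dropout_mask(rate))
  // in the forwarding paths above.
  std::vector<float> mask = make_dropout_mask(xs.size(), 0.5f);
  for (std::size_t i = 0; i < xs.size(); ++i)
    std::cout << xs[i] * mask[i] << ' '; // each output is 0 or 2 * xs[i]
  std::cout << '\n';
  return 0;
}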