From a89facd88a6784d941761e72723868aa0fca9b19 Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Thu, 20 Jan 2022 18:06:41 +0900
Subject: [PATCH] [ SAVE/LOAD ] save / load optimizer variables

Enable saving and loading of optimizer variables, such as the M and V
tensors of the Adam optimizer.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
 nntrainer/layers/layer_context.cpp | 28 +++++++++++++++++++++++++
 nntrainer/layers/layer_context.h   | 17 +++++++++++++++
 nntrainer/layers/layer_node.cpp    | 42 +++++++++++++++++++++++++++++---------
 nntrainer/layers/layer_node.h      |  6 ++++--
 nntrainer/models/neuralnet.cpp     | 41 ++++++++++++++++++++++++++++++++++---
 nntrainer/models/neuralnet.h       |  4 ++++
 nntrainer/tensor/weight.h          |  6 ++++++
 7 files changed, 129 insertions(+), 15 deletions(-)

diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp
index 92011f5..6c15112 100644
--- a/nntrainer/layers/layer_context.cpp
+++ b/nntrainer/layers/layer_context.cpp
@@ -164,6 +164,34 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const {
 }
 
 /**
+ * @brief Get the Weight Optimizer Variable tensor object
+ *
+ * @param idx Identifier of the weight
+ * @param jdx Identifier of the optimizer variable
+ * @return Tensor& Reference to the weight optimizer variable tensor
+ */
+Tensor &RunLayerContext::getWeightOptVar(unsigned int idx,
+                                         unsigned int jdx) const {
+  if (!weights[idx]->hasGradient())
+    throw std::invalid_argument(
+      "Requesting optimizer variable for a non-trainable weight.");
+  return weights[idx]->getOptimizerVariableRef(jdx);
+}
+
+/**
+ * @brief Get the number of Weight Optimizer Variable tensor objects
+ *
+ * @param idx Identifier of the weight
+ * @return unsigned int Number of the weight optimizer variables
+ */
+unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const {
+  if (!weights[idx]->hasGradient())
+    throw std::invalid_argument(
+      "Requesting optimizer variable for a non-trainable weight.");
+  return weights[idx]->getNumOptVariable();
+}
+
+/**
  * @brief Get regularization loss for the weight
  *
  * @param idx Identifier of the weight
diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h
index 3f0a0c7..ef2bf26 100644
--- a/nntrainer/layers/layer_context.h
+++ b/nntrainer/layers/layer_context.h
@@ -363,6 +363,15 @@ public:
   Tensor &getWeightGrad(unsigned int idx) const;
 
   /**
+   * @brief Get the Weight Optimizer Variable tensor object
+   *
+   * @param idx Identifier of the weight
+   * @param jdx Identifier of the weight optimizer variable
+   * @return Tensor& Reference to the weight optimizer variable tensor
+   */
+  Tensor &getWeightOptVar(unsigned int idx, unsigned int jdx) const;
+
+  /**
    * @brief Get the Weight name
    *
    * @param idx Identifier of the weight
@@ -580,6 +589,14 @@ public:
   unsigned int getNumWeights() const { return weights.size(); }
 
   /**
+   * @brief Get the number of Weight Optimizer Variable tensor objects
+   *
+   * @param idx Identifier of the weight
+   * @return unsigned int Number of the weight optimizer variables
+   */
+  unsigned int getNumWeightOptVar(unsigned int idx) const;
+
+  /**
    * @brief Get the number of requested tensors objects
    *
    * @return unsigned int number of requested tensors
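A usage sketch of the two accessors added above, for review context. `dump_opt_vars` is a hypothetical helper, not part of this patch; it relies only on API that already appears in the diff (a finalized nntrainer::RunLayerContext plus Tensor::save) and mirrors the loops that LayerNode::save()/read() gain below.

    #include <fstream>

    #include <layer_context.h> // nntrainer::RunLayerContext

    // Hypothetical helper: write out every optimizer variable (for Adam,
    // the M and V tensors of each weight) owned by a finalized context.
    void dump_opt_vars(nntrainer::RunLayerContext &ctx, std::ofstream &file) {
      for (unsigned int i = 0; i < ctx.getNumWeights(); ++i) {
        if (!ctx.isGradientLastAccess(i))
          continue; // shared weights are serialized at their last access only
        // both accessors throw std::invalid_argument for weights without a
        // gradient, per the guards added above
        for (unsigned int j = 0; j < ctx.getNumWeightOptVar(i); ++j)
          ctx.getWeightOptVar(i, j).save(file);
      }
    }
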
diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp
index 065c90a..1a593b0 100644
--- a/nntrainer/layers/layer_node.cpp
+++ b/nntrainer/layers/layer_node.cpp
@@ -415,24 +415,46 @@ void LayerNode::exportTo(Exporter &exporter,
   layer->exportTo(exporter, method);
 }
 
-void LayerNode::read(std::ifstream &file) {
+void LayerNode::read(std::ifstream &file, bool opt_var) {
   NNTR_THROW_IF(!run_context, std::runtime_error)
     << __func__ << " layer needs to be finalized first!";
-  for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
-    /// @note shared weights are only be read at the first acecss
-    if (run_context->isGradientLastAccess(i)) {
-      run_context->getWeight(i).read(file);
+  if (opt_var) {
+    for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+      if (run_context->isGradientLastAccess(i)) {
+        // @note read optimizer variables
+        for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) {
+          run_context->getWeightOptVar(i, j).read(file);
+        }
+      }
+    }
+  } else {
+    for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+      /// @note shared weights are only read at the first access
+      if (run_context->isGradientLastAccess(i)) {
+        run_context->getWeight(i).read(file);
+      }
     }
   }
 }
 
-void LayerNode::save(std::ofstream &file) const {
+void LayerNode::save(std::ofstream &file, bool opt_var) const {
   NNTR_THROW_IF(!run_context, std::runtime_error)
     << __func__ << " layer needs to be finalized first!";
-  /// @note shared weights are only be saved at the first access
-  for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
-    if (run_context->isGradientLastAccess(i)) {
-      run_context->getWeight(i).save(file);
+  if (opt_var) {
+    for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+      if (run_context->isGradientLastAccess(i)) {
+        // @note save optimizer variables
+        for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) {
+          run_context->getWeightOptVar(i, j).save(file);
+        }
+      }
+    }
+  } else {
+    // @note shared weights are only saved at the first access
+    for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+      if (run_context->isGradientLastAccess(i)) {
+        run_context->getWeight(i).save(file);
+      }
     }
   }
 }
diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h
index 5538a88..ede2006 100644
--- a/nntrainer/layers/layer_node.h
+++ b/nntrainer/layers/layer_node.h
@@ -572,14 +572,16 @@ public:
   /**
    * @brief read layer Weight & Bias data from file
    * @param file input file stream
+   * @param opt_var read optimizer variables instead of weights if true
    */
-  void read(std::ifstream &file);
+  void read(std::ifstream &file, bool opt_var = false);
 
   /**
    * @brief save layer Weight & Bias data from file
    * @param file output file stream
+   * @param opt_var save optimizer variables instead of weights if true
    */
-  void save(std::ofstream &file) const;
+  void save(std::ofstream &file, bool opt_var = false) const;
 
   /**
    * @brief get loss for the layer
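The neuralnet.cpp changes below fix the on-disk layout of a MODEL_FORMAT_BIN file: the weight tensors, then the optimizer section written by opt->save() (load() reads its type string back via readString()), then one set of optimizer variables per weight when the type is "adam", and finally epoch_idx and iter. A small standalone sketch to inspect that trailer; it assumes both counters are serialized as unsigned int, so adjust the types if the members are declared with different widths:

    #include <fstream>
    #include <iostream>

    // Print the trailer that NeuralNetwork::save() (below) writes last.
    int main(int argc, char **argv) {
      if (argc < 2)
        return 1;
      std::ifstream f(argv[1], std::ios::in | std::ios::binary);
      unsigned int epoch_idx = 0, iter = 0;
      // the two counters are the final bytes of the file
      f.seekg(-static_cast<std::streamoff>(sizeof(epoch_idx) + sizeof(iter)),
              std::ios::end);
      f.read(reinterpret_cast<char *>(&epoch_idx), sizeof(epoch_idx));
      f.read(reinterpret_cast<char *>(&iter), sizeof(iter));
      std::cout << "epoch_idx=" << epoch_idx << ", iter=" << iter << '\n';
      return 0;
    }
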
diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp
index b16b013..daa47f2 100644
--- a/nntrainer/models/neuralnet.cpp
+++ b/nntrainer/models/neuralnet.cpp
@@ -71,6 +71,8 @@ NeuralNetwork::NeuralNetwork(AppContext app_context_) :
   initialized(false),
   compiled(false),
   loadedFromConfig(false),
+  loadedWeight(false),
+  bin_file_pos(0),
   app_context(app_context_) {}
 
 int NeuralNetwork::loadFromConfig(const std::string &config) {
@@ -323,8 +325,19 @@ void NeuralNetwork::save(const std::string &file_path,
     for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
          iter++) {
       (*iter)->save(model_file);
     }
+
+    opt->save(model_file);
+
+    if (opt->getType() == "adam") {
+      for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
+           iter++) {
+        (*iter)->save(model_file, true);
+      }
+    }
+
     model_file.write((char *)&epoch_idx, sizeof(epoch_idx));
     model_file.write((char *)&iter, sizeof(iter));
+    model_file.close();
     break;
   }
@@ -361,13 +374,28 @@ void NeuralNetwork::load(const std::string &file_path,
     auto model_file = checkedOpenStream<std::ifstream>(
       file_path, std::ios::in | std::ios::binary);
 
-    for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
-      (*iter)->read(model_file);
+    if (!loadedWeight) {
+      for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
+           iter++) {
+        (*iter)->read(model_file);
+      }
+      loadedWeight = true;
+      bin_file_pos = model_file.tellg();
     }
-
     try {
       /// this is assuming that the failure is allowed at the end of the file
       /// read. so, after this line, additional read shouldn't be called
+      model_file.seekg(bin_file_pos);
+
+      std::string opt_type;
+      opt_type = readString(model_file);
+      if (istrequal(opt_type, "adam") && istrequal(opt->getType(), "adam")) {
+        for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
+             iter++) {
+          (*iter)->read(model_file, true);
+        }
+      }
+
       checkedRead(model_file, (char *)&epoch_idx, sizeof(epoch_idx),
                   "[NeuralNetwork::readModel] failed to read epoch_idx");
       checkedRead(model_file, (char *)&iter, sizeof(iter),
@@ -604,6 +632,11 @@ int NeuralNetwork::train(const std::vector<std::string> &values) {
   status = allocate(ExecutionMode::TRAIN);
   NN_RETURN_STATUS();
 
+  // @note needs to run here, after allocation, to read the optimizer variables
+  if (!load_path.empty()) {
+    load(load_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
+  }
+
   status = train_run();
   NN_RETURN_STATUS();
 
@@ -800,6 +833,8 @@ void swap(NeuralNetwork &lhs, NeuralNetwork &rhs) {
     swap(lhs.graph_representation, rhs.graph_representation);
     swap(lhs.compiled, rhs.compiled);
     swap(lhs.loadedFromConfig, rhs.loadedFromConfig);
+    swap(lhs.loadedWeight, rhs.loadedWeight);
+    swap(lhs.bin_file_pos, rhs.bin_file_pos);
   }
 }
 
diff --git a/nntrainer/models/neuralnet.h b/nntrainer/models/neuralnet.h
index 0e86017..e87c4c1 100644
--- a/nntrainer/models/neuralnet.h
+++ b/nntrainer/models/neuralnet.h
@@ -530,6 +530,10 @@ private:
   bool loadedFromConfig; /**< Check if config is loaded to prevent load twice */
 
+  bool loadedWeight; /**< Check if weight is loaded to prevent load twice */
+
+  uint64_t bin_file_pos; /**< file position to load from later */
+
   RunStats validation; /** validation statistics of the model */
   RunStats training;   /** training statistics of the model */
   RunStats testing;    /** testing statistics of the model */
diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h
index c5c6841..d13ae1b 100644
--- a/nntrainer/tensor/weight.h
+++ b/nntrainer/tensor/weight.h
@@ -198,6 +198,12 @@ public:
   Tensor &getOptimizerVariableRef(unsigned int idx) { return *opt_vars[idx]; }
 
   /**
+   * @brief Get the number of optimizer variables
+   * @retval number of optimizer variables
+   */
+  unsigned int getNumOptVariable() { return opt_vars.size(); }
+
+  /**
    * @brief check if weight regularizer type is l2norm
    * @return bool is weight regrulatizer type is L2 Norm
    */
-- 
2.7.4
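Finally, an end-to-end sketch of the round trip this patch enables (hypothetical driver code against the internal API; model construction, compile/initialize/allocate, and dataset wiring are omitted). With an Adam optimizer, a resumed run continues from the saved first and second moment estimates instead of freshly zeroed ones:

    #include <neuralnet.h> // nntrainer::NeuralNetwork

    void checkpoint_roundtrip(nntrainer::NeuralNetwork &net,
                              const std::string &path) {
      // writes weights, the optimizer type, per-weight optimizer variables
      // (adam only), then epoch_idx and iter
      net.save(path, ml::train::ModelFormat::MODEL_FORMAT_BIN);

      // the first load() restores weights and records bin_file_pos;
      // train() later calls load() again so the optimizer variables are
      // read once they have been allocated
      net.load(path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
    }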