From: Jihoon Lee
Date: Fri, 11 Mar 2022 15:42:54 +0000 (+0900)
Subject: Change loading meta information behavior
X-Git-Tag: accepted/tizen/unified/20220323.062643~5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=78d35926337dcc9161216a8f651443f0fa1c24af;p=platform%2Fcore%2Fml%2Fnntrainer.git

Change loading meta information behavior

**Before this PR**
Optimizer variables were loaded from load_path every time, so calling
model->train(); repeatedly behaved unintuitively:
1. model->train() reloaded from the original load path, so the iteration
   number rolled back to the first one.
2. The same happened for the adam weights.
3. model->load(); after model->initialize(); was a no-op because loadedWeight
   became true.

**After this PR**
1. The model loads from load_path only at initialize time.
2. model->load() is no longer implicitly overridden (see the usage sketch
   below).

**Additional Changes**
1. Optimizer weights became part of the weights and are now available after
   initialize().
2. The save format is now coherent with the load format.
3. Some unused variables were deleted.
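To make the new behavior concrete, here is a minimal, hypothetical usage
sketch. It is illustrative only and not part of this patch: it assumes the
public ccapi (ml::train::createModel / createLayer / createOptimizer), and the
layer properties and file names below are placeholders.

```cpp
// Hypothetical usage sketch, not part of this patch. The call ordering is the
// point: load_path is consumed exactly once inside initialize(), and an
// explicit load() after initialize() is honored instead of being skipped.
#include <layer.h>
#include <model.h>
#include <optimizer.h>

int main() {
  auto model = ml::train::createModel(ml::train::ModelType::NEURAL_NET);
  model->addLayer(ml::train::createLayer(
    "fully_connected", {"name=fc0", "unit=10", "input_shape=1:1:100"}));
  model->setOptimizer(
    ml::train::createOptimizer("adam", {"learning_rate=0.001"}));
  model->setProperty({"batch_size=32", "epochs=2", "save_path=next.bin"});

  model->compile();
  model->initialize(); // weights (and now adam momentums) are read from
                       // load_path here, exactly once, if one was configured

  // no longer a silent no-op: the loadedWeight guard is gone
  model->load("pretrained.bin", ml::train::ModelFormat::MODEL_FORMAT_BIN);

  model->train(); // dataset/loss setup omitted for brevity
  model->train(); // a second call no longer rolls the iteration and the adam
                  // state back to the checkpoint on disk
  return 0;
}
```

Since optimizer variables are now requested from the weight pool, initialize()
restores them together with the weights, and train_run() resets them via
clearOptVar() when training starts fresh.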
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Jihoon Lee
---

diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp
index 6c15112..d565611 100644
--- a/nntrainer/layers/layer_context.cpp
+++ b/nntrainer/layers/layer_context.cpp
@@ -172,9 +172,6 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const {
  */
 Tensor &RunLayerContext::getWeightOptVar(unsigned int idx,
                                          unsigned int jdx) const {
-  if (!weights[idx]->hasGradient())
-    throw std::invalid_argument(
-      "Requesting gradient for a non-trainable weight.");
   return weights[idx]->getOptimizerVariableRef(jdx);
 }
 
@@ -185,9 +182,6 @@ Tensor &RunLayerContext::getWeightOptVar(unsigned int idx,
  * @return int Number of the weight optimizer variable
  */
 unsigned int RunLayerContext::getNumWeightOptVar(unsigned int idx) const {
-  if (!weights[idx]->hasGradient())
-    throw std::invalid_argument(
-      "Requesting gradient for a non-trainable weight.");
   return weights[idx]->getNumOptVariable();
 }
 
diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp
index f9e6d50..0820c59 100644
--- a/nntrainer/layers/layer_node.cpp
+++ b/nntrainer/layers/layer_node.cpp
@@ -421,12 +421,9 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
   if (opt_var) {
     for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
       if (run_context->isGradientLastAccess(i) && getTrainable()) {
-        // @note read optimizer variables
-        if (run_context->weightHasGradient(i)) {
-          for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i);
-               ++j) {
-            run_context->getWeightOptVar(i, j).read(file);
-          }
+        /// @note read optimizer variables
+        for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) {
+          run_context->getWeightOptVar(i, j).read(file);
         }
       }
     }
@@ -466,6 +463,19 @@ void LayerNode::save(std::ofstream &file, bool opt_var) const {
   }
 }
 
+void LayerNode::clearOptVar() {
+  NNTR_THROW_IF(!run_context, std::runtime_error)
+    << __func__ << " layer needs to be finalized first!";
+  for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+    if (run_context->isGradientLastAccess(i) && getTrainable()) {
+      /// @note read optimizer variables
+      for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i); ++j) {
+        run_context->getWeightOptVar(i, j).initialize();
+      }
+    }
+  }
+}
+
 /**
  * @brief Finalize creating the layer node
  */
@@ -624,7 +634,8 @@ void LayerNode::setBatch(unsigned int batch) {
  * @brief If the current layer can support in-place
  */
 bool LayerNode::supportInPlace() const {
-  ///@note below is a quick fix, we need to have a guard that this shouldn't be
+  ///@note below is a quick fix, we need to have a guard that this shouldn't
+  /// be
   /// query until realizeProps has been finalized ( which means we will need
   /// another end point to fixate this property )
   if (getDistribute()) {
diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h
index ede2006..9a50f22 100644
--- a/nntrainer/layers/layer_node.h
+++ b/nntrainer/layers/layer_node.h
@@ -584,6 +584,12 @@ public:
   void save(std::ofstream &file, bool opt_var = false) const;
 
   /**
+   * @brief clear optimizer variable to initial state
+   *
+   */
+  void clearOptVar();
+
+  /**
    * @brief get loss for the layer
    * @return loss of the layer
    */
diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp
index 8eaba4d..b7a30e0 100644
--- a/nntrainer/models/neuralnet.cpp
+++ b/nntrainer/models/neuralnet.cpp
@@ -71,8 +71,6 @@ NeuralNetwork::NeuralNetwork(AppContext app_context_) :
   initialized(false),
   compiled(false),
   loadedFromConfig(false),
-  loadedWeight(false),
-  bin_file_pos(0),
   app_context(app_context_) {}
 
 int NeuralNetwork::loadFromConfig(const std::string &config) {
@@ -189,6 +187,8 @@ int NeuralNetwork::initialize() {
     std::get(model_flex_props));
 
   // initialize optimizer and related variables
+  /// @todo: initialize should take a mode and check if mode is train but
+  /// optimizer is not given, make it as a hard error
   if (opt) {
     /** TODO: update request of optimizer to be of same format as
      * Layer::requestTensor */
@@ -205,11 +205,7 @@ int NeuralNetwork::initialize() {
 
   initialized = true;
 
-  // @note we need check loadedWeight for the case of multiple call of load to
-  // load weight. Only the weight needs to be loaded here. Becuase the buffer
-  // for the optimizer is not allocated yet.
-  // loadedWeight check is just for the duplicate load of weight.
-  if (!load_path.empty() && !loadedWeight) {
+  if (!load_path.empty()) {
     load(load_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
   }
 
@@ -328,14 +324,12 @@ void NeuralNetwork::save(const std::string &file_path,
   switch (format) {
   case ml::train::ModelFormat::MODEL_FORMAT_BIN: {
     auto model_file = checkedOpenStream(
-      file_path, std::ios::out | std::ios::binary);
+      file_path, std::ios::out | std::ios::binary | std::ios::trunc);
     for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
       (*iter)->save(model_file);
     }
-
-    opt->save(model_file);
-
-    if (istrequal(opt->getType(), "adam")) {
+    if (opt && istrequal(opt->getType(), "adam")) {
+      model_file.write("adam", 4);
       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
            iter++) {
         (*iter)->save(model_file, true);
       }
@@ -381,22 +375,13 @@ void NeuralNetwork::load(const std::string &file_path,
     auto model_file = checkedOpenStream(
       file_path, std::ios::in | std::ios::binary);
 
-    if (!loadedWeight) {
-      for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
-           iter++) {
-        (*iter)->read(model_file);
-      }
-      loadedWeight = true;
-      bin_file_pos = model_file.tellg();
-      load_path = file_path;
-      return;
+    for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
+      (*iter)->read(model_file);
     }
     try {
       /// this is assuming that the failure is allowed at the end of the file
      /// read. so, after this line, additional read shouldn't be called
-      model_file.seekg(bin_file_pos);
-
-      if (istrequal(opt->getType(), "adam")) {
+      if (opt && istrequal(opt->getType(), "adam")) {
        char opt_type[4];
        model_file.read(opt_type, 4);
        if (istrequal(opt_type, "adam")) {
@@ -412,7 +397,8 @@ void NeuralNetwork::load(const std::string &file_path,
       checkedRead(model_file, (char *)&iter, sizeof(iter),
                   "[NeuralNetwork::readModel] failed to read iteration");
     } catch (...) {
-      std::cerr << "failed to read epoch idx, proceeding with default index\n";
+      std::cerr << "failed to read additional data like optimizer variable, "
+                   "iteration, proceeding with default\n";
     }
 
     ml_logi("read modelfile: %s", file_path.c_str());
@@ -644,11 +630,6 @@ int NeuralNetwork::train(const std::vector &values) {
   status = allocate(ExecutionMode::TRAIN);
   NN_RETURN_STATUS();
 
-  // @note Need to be here to read the optimizer variables
-  if (!load_path.empty()) {
-    load(load_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
-  }
-
   status = train_run();
   NN_RETURN_STATUS();
 
@@ -668,10 +649,14 @@ int NeuralNetwork::train_run() {
   int status = ML_ERROR_NONE;
 
   if (!std::get(model_flex_props)) {
-    epoch_idx = 0;
     iter = 0;
+    for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
+      (*iter)->clearOptVar();
+    }
   }
 
+  epoch_idx = 0;
+
   auto batch_size = std::get(model_flex_props);
 
   auto const &outputs = model_graph.getOutputTensors();
@@ -845,8 +830,6 @@ void swap(NeuralNetwork &lhs, NeuralNetwork &rhs) {
     swap(lhs.graph_representation, rhs.graph_representation);
     swap(lhs.compiled, rhs.compiled);
     swap(lhs.loadedFromConfig, rhs.loadedFromConfig);
-    swap(lhs.loadedWeight, rhs.loadedWeight);
-    swap(lhs.bin_file_pos, rhs.bin_file_pos);
   }
 }
 
diff --git a/nntrainer/models/neuralnet.h b/nntrainer/models/neuralnet.h
index b12a476..8b4a19a 100644
--- a/nntrainer/models/neuralnet.h
+++ b/nntrainer/models/neuralnet.h
@@ -530,10 +530,6 @@ private:
 
   bool loadedFromConfig; /**< Check if config is loaded to prevent load twice */
 
-  bool loadedWeight; /**< Check if weight is loaded to prevent load twice */
-
-  uint64_t bin_file_pos; /**< save file position to load later*/
-
   RunStats validation; /** validation statistics of the model */
   RunStats training;   /** training statistics of the model */
   RunStats testing;    /** testing statistics of the model */
diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp
index 7b212e0..4a88081 100644
--- a/nntrainer/tensor/manager.cpp
+++ b/nntrainer/tensor/manager.cpp
@@ -597,13 +597,15 @@ bool Manager::isSecondLastAccess(const std::string &name,
 std::vector Manager::requestWeightOptimizerVariables(
   const std::vector &dims, const std::string &name,
   const TensorLifespan &lifespan, Tensor::Initializer initializer) {
-  auto const &exec_order = weight_pool.getExecutionOrder(name);
+  auto const exec_order = weight_pool.getExecutionOrder(name);
 
   std::vector ret;
   ret.reserve(dims.size());
 
+  /// @note this is assuming weight optimizer variables is treated as weight, if
+  /// not, there is room to optimize below behavior
   for (unsigned int idx = 0; idx < dims.size(); idx++)
-    ret.push_back(tensor_pool.request(name + ":opt" + std::to_string(idx),
+    ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx),
                                       dims[idx], exec_order, lifespan,
                                       initializer));
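For reference, a sketch of the MODEL_FORMAT_BIN layout implied by the save()
and load() hunks above. This is an informal reading of the patch, not a formal
specification; field widths depend on the member types declared in
neuralnet.h.

```cpp
// Informal sketch of the MODEL_FORMAT_BIN layout implied by this patch:
//
//   for each layer node, in graph order:
//     weight tensors                    // (*iter)->save(model_file)
//   if the optimizer is adam:
//     "adam"                            // 4-byte tag, model_file.write("adam", 4)
//     for each layer node, in graph order:
//       optimizer variable tensors      // (*iter)->save(model_file, true)
//   epoch_idx                           // read back with checkedRead()
//   iter                                // read back with checkedRead()
//
// load() mirrors this: the weights are always read, while the trailing tag,
// optimizer variables, epoch index, and iteration are read inside a try
// block, so a file without them falls back to the defaults instead of
// failing.
```

This tagged trailer is what makes the save format coherent with the load
format, as noted in the "Additional Changes" list above.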