From fdb84ac99c4915e87981f6f01d21147ebfc1745d Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Wed, 25 Aug 2021 13:48:46 +0900
Subject: [PATCH] [Manager] Use TensorPool for Gradients

Use TensorPool for gradients of the weights.

Signed-off-by: Parichay Kapoor
---
 jni/Android.mk                     |   1 +
 nntrainer/graph/network_graph.cpp  |  15 +++--
 nntrainer/layers/layer_context.cpp |  23 ++++++++
 nntrainer/layers/layer_context.h   |  16 +++++
 nntrainer/layers/layer_node.h      |  12 ++++
 nntrainer/tensor/manager.cpp       | 117 ++++++++++++++++++++++---------------
 nntrainer/tensor/manager.h         |  11 ++++
 7 files changed, 141 insertions(+), 54 deletions(-)

diff --git a/jni/Android.mk b/jni/Android.mk
index 9a21c91..796772c 100644
--- a/jni/Android.mk
+++ b/jni/Android.mk
@@ -139,6 +139,7 @@ NNTRAINER_SRCS := $(NNTRAINER_ROOT)/nntrainer/models/neuralnet.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/var_grad.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/weight.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/tensor_dim.cpp \
+                  $(NNTRAINER_ROOT)/nntrainer/tensor/tensor_pool.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/memory_pool.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/basic_planner.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/blas_interface.cpp \
diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp
index c978565..63d7ee9 100644
--- a/nntrainer/graph/network_graph.cpp
+++ b/nntrainer/graph/network_graph.cpp
@@ -430,12 +430,15 @@ void NetworkGraph::setBatchSize(unsigned int batch_size) {
   for (auto iter = cbegin(); iter != cend(); iter++) {
     (*iter)->setBatch(batch_size);
 
-    const InitLayerContext &init_context = (*iter)->getInitContext();
-    // resize tensors spec
-    for (auto const &ts : init_context.getTensorsSpec()) {
-      tensor_manager->setBatchSize(std::get<3>(ts), batch_size);
-      tensor_manager->setBatchSize(std::get<3>(ts) + Var_Grad::grad_suffix,
-                                   batch_size);
+    if ((*iter)->isRunContextAvailable()) {
+      const RunLayerContext &context = (*iter)->getRunContext();
+      // resize tensors spec
+      for (unsigned int idx = 0; idx < context.getNumTensors(); idx++) {
+        auto const &ts = context.getTensor(idx);
+        tensor_manager->setBatchSize(ts.getName(), batch_size);
+        auto const &ts_grad = context.getTensorGrad(idx);
+        tensor_manager->setBatchSize(ts_grad.getName(), batch_size);
+      }
     }
   }
   tensor_manager->setBatchSize(batch_size);
diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp
index c65b0ed..bccf574 100644
--- a/nntrainer/layers/layer_context.cpp
+++ b/nntrainer/layers/layer_context.cpp
@@ -181,6 +181,16 @@ Tensor &RunLayerContext::getTensor(unsigned int idx) {
 }
 
 /**
+ * @brief Get the Tensor object
+ *
+ * @param idx Identifier of the tensor
+ * @return Tensor& Reference to the tensor
+ */
+const Tensor &RunLayerContext::getTensor(unsigned int idx) const {
+  return tensors[idx]->getVariableRef();
+}
+
+/**
  * @brief Get the Tensor Grad object
  *
  * @param idx Identifier of the tensor
@@ -194,6 +204,19 @@ Tensor &RunLayerContext::getTensorGrad(unsigned int idx) {
 }
 
 /**
+ * @brief Get the Tensor Grad object
+ *
+ * @param idx Identifier of the tensor
+ * @return Tensor& Reference to the tensor grad tensor
+ */
+const Tensor &RunLayerContext::getTensorGrad(unsigned int idx) const {
+  if (!tensors[idx]->hasGradient())
+    throw std::invalid_argument(
+      "Requesting gradient for a non-trainable tensor.");
+  return tensors[idx]->getGradientRef();
+}
+
+/**
  * @brief check if the tensor has gradient
  *
  * @param idx Identifier of the tensor
diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h
index 54e19ff..a09a040 100644
--- a/nntrainer/layers/layer_context.h
+++ b/nntrainer/layers/layer_context.h
@@ -429,6 +429,14 @@ public:
   Tensor &getTensor(unsigned int idx);
 
   /**
+   * @brief Get the Tensor object
+   *
+   * @param idx Identifier of the tensor
+   * @return Tensor& Reference to the tensor
+   */
+  const Tensor &getTensor(unsigned int idx) const;
+
+  /**
    * @brief Get the Tensor Grad object
    *
    * @param idx Identifier of the tensor
@@ -437,6 +445,14 @@ public:
   Tensor &getTensorGrad(unsigned int idx);
 
   /**
+   * @brief Get the Tensor Grad object
+   *
+   * @param idx Identifier of the tensor
+   * @return Tensor& Reference to the tensor grad tensor
+   */
+  const Tensor &getTensorGrad(unsigned int idx) const;
+
+  /**
    * @brief check if the tensor has gradient
    *
    * @param idx Identifier of the tensor
diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h
index 88a6670..abeb71e 100644
--- a/nntrainer/layers/layer_node.h
+++ b/nntrainer/layers/layer_node.h
@@ -561,6 +561,18 @@ public:
   }
 
   /**
+   * @brief check if run layer context is available
+   *
+   * @retval bool true if context is available else false
+   */
+  bool isRunContextAvailable() const {
+    if (!run_context)
+      return false;
+
+    return true;
+  }
+
+  /**
    * @brief Set the Run Context object with given tensor packs
    *
    * @param weights weights
diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp
index f769291..f2536ab 100644
--- a/nntrainer/tensor/manager.cpp
+++ b/nntrainer/tensor/manager.cpp
@@ -324,17 +324,14 @@ void Manager::deallocateWeights() {
 }
 
 void Manager::allocateGradients() {
-  /** Allocate the source tensors for shared memories */
-  if (!shared_grad.empty())
-    shared_grad.allocate();
-
   if (LAYER_V2) {
     for (auto &w : weights_v2) {
       w->allocateOptimizerVariables();
     }
-    if (tensor_pool.minMemoryRequirement() > 0)
-      tensor_pool.allocate();
   } else {
+    /** Allocate the source tensors for shared memories */
+    if (!shared_grad.empty())
+      shared_grad.allocate();
     for (auto &l_w : weights) {
       for (auto &w : l_w) {
         Weight &weight = w.get();
@@ -345,14 +342,12 @@
   }
 }
 void Manager::deallocateGradients() {
-  shared_grad.deallocate();
-
   if (LAYER_V2) {
     for (auto &w : weights_v2) {
       w->deallocateOptimizerVariables();
     }
-    tensor_pool.deallocate();
   } else {
+    shared_grad.deallocate();
     for (auto &l_w : weights) {
       for (auto &w : l_w) {
         Weight &weight = w.get();
@@ -366,9 +361,7 @@
  * @brief Initialize the weight gradients
  */
 void Manager::initializeGradients() {
-  if (LAYER_V2) {
-    tensor_pool.finalize(BasicPlanner(), 0, max_exec_order);
-  } else {
+  if (!LAYER_V2) {
     if (total_weight_size == 0) {
       ml_logw(
         "Nothing done on initialize because there is no weight registered");
@@ -510,9 +503,9 @@ void Manager::allocateInOuts() {
     for (auto &out : outputs_v2) {
       out->allocateVariable();
     }
-    for (auto &t : tensors_v2) {
-      t->allocateVariable();
-    }
+    // for (auto &t : tensors_v2) {
+    //   t->allocateVariable();
+    // }
   } else {
     for (auto &l_io : in_outs) {
       for (auto &io : l_io) {
@@ -532,9 +525,9 @@ void Manager::deallocateInOuts() {
     for (auto &out : outputs_v2) {
       out->deallocateVariable();
     }
-    for (auto &t : tensors_v2) {
-      t->deallocateVariable();
-    }
+    // for (auto &t : tensors_v2) {
+    //   t->deallocateVariable();
+    // }
   } else {
     for (auto &l_io : in_outs) {
      for (auto &io : l_io) {
@@ -556,9 +549,9 @@ void Manager::allocateDerivatives() {
     for (auto &out : outputs_v2) {
       out->allocateGradient();
     }
-    for (auto &t : tensors_v2) {
-      t->allocateGradient();
-    }
+    // for (auto &t : tensors_v2) {
+    //   t->allocateGradient();
+    // }
   } else {
     for (auto &l_io : in_outs) {
       for (auto &io : l_io) {
@@ -578,9 +571,9 @@ void Manager::deallocateDerivatives() {
     for (auto &out : outputs_v2) {
       out->deallocateGradient();
     }
-    for (auto &t : tensors_v2) {
-      t->deallocateGradient();
-    }
+    // for (auto &t : tensors_v2) {
+    //   t->deallocateGradient();
+    // }
   } else {
     for (auto &l_io : in_outs) {
      for (auto &io : l_io) {
@@ -653,9 +646,9 @@ void Manager::initializeTensorsInference() {
     }
 
     // Inference Mode without optimizations
-    for (auto &ts : tensors_v2) {
-      ts->initialize(Tensor(), Tensor(), false);
-    }
+    // for (auto &ts : tensors_v2) {
+    //   ts->initialize(Tensor(), Tensor(), false);
+    // }
 
     // In inference mode, do not allocate the memory for the input of the first
     // layer. These is the first entry in the in_outs. Inference() will override
@@ -672,11 +665,10 @@ void Manager::initializeTensorsTrain() {
   // Initialize gradients
   initializeGradients();
 
-  // Initialize shared derivative memory
-  if (max_derivative_size > 0 && enable_activation_memory_opt)
-    shared_deriv = Tensor(TensorDim({max_derivative_size}), false);
-
   if (!LAYER_V2) {
+    // Initialize shared derivative memory
+    if (max_derivative_size > 0 && enable_activation_memory_opt)
+      shared_deriv = Tensor(TensorDim({max_derivative_size}), false);
     for (unsigned int idx = 0; idx < in_outs.size(); idx++) {
       auto &l_io = in_outs[idx];
       unsigned int offset = 0;
@@ -703,15 +695,17 @@ void Manager::initializeTensorsTrain() {
       }
     }
   } else {
+    tensor_pool.finalize(BasicPlanner(), 0, max_exec_order);
+
     // Training Mode without optimizations
     for (auto &outs : outputs_v2) {
       outs->initialize(Tensor(), Tensor(), true);
     }
 
     // Training Mode without optimizations
-    for (auto &ts : tensors_v2) {
-      ts->initialize(Tensor(), Tensor(), true);
-    }
+    // for (auto &ts : tensors_v2) {
+    //   ts->initialize(Tensor(), Tensor(), true);
+    // }
 
     // Training Mode without optimizations
     for (auto &ins : inputs_v2) {
@@ -814,34 +808,61 @@ Manager::requestWeights(const GraphNode &node,
 std::vector<Var_Grad *>
 Manager::requestTensors(const GraphNode &node,
                         const std::vector<Var_Grad::Spec> &tensors_spec) {
-  auto ret = requestTensors(node, tensors_spec, tensors_v2);
   const auto &exec_order = node.getExecutionOrder();
-  for (unsigned int idx = 0; idx < ret.size(); idx++) {
-    auto const &t = ret[idx];
-    auto const &vname = t->getName();
-    auto const &gname = t->getGradientName();
-    auto const &tspan = std::get<4>(tensors_spec[idx]);
+
+  std::vector<Var_Grad *> ret;
+  size_t current_size = tensors_v2.size();
+
+  for (auto const &ts : std::as_const(tensors_spec)) {
+    auto const &tspan = std::get<4>(ts);
+    std::vector<unsigned int> var_exec_order;
+    std::vector<unsigned int> grad_exec_order;
 
     /** usage for tensors */
     if (enum_class_logical_and(
           tspan, TensorLifespan::FORWARD_FUNC_LIFESPAN))
-      tensor_exec_order[vname].push_back(std::get<0>(exec_order));
+      var_exec_order.push_back(std::get<0>(exec_order));
 
     /** usage for tensors gradient in backwarding */
     if (enum_class_logical_and(
          tspan, TensorLifespan::BACKWARD_FUNC_LIFESPAN)) {
-      tensor_exec_order[vname].push_back(std::get<1>(exec_order));
-      tensor_exec_order[gname].push_back(std::get<1>(exec_order));
+      var_exec_order.push_back(std::get<1>(exec_order));
+      grad_exec_order.push_back(std::get<1>(exec_order));
 
-      tensor_exec_order[vname].push_back(std::get<2>(exec_order));
-      tensor_exec_order[gname].push_back(std::get<2>(exec_order));
+      var_exec_order.push_back(std::get<2>(exec_order));
+      grad_exec_order.push_back(std::get<2>(exec_order));
     }
 
-    /** set tensor lifespan */
-    expandLifespan(vname, tspan);
-    expandLifespan(gname, tspan);
+    Tensor *var =
+      tensor_pool.requestTensor(std::get<0>(ts), /// tensor dim
+                                var_exec_order,
+                                tspan,           /// lifespan
+                                std::get<3>(ts), /// name
+                                std::get<1>(ts)  /// tensor initializer
+      );
+    max_exec_order =
+      std::max(max_exec_order,
+               *std::max_element(var_exec_order.begin(), var_exec_order.end()));
+
+    Tensor *grad = nullptr;
+    // TODO: change to enum_class_and
+    if (std::get<2>(ts) /** need gradient */ &&
+        enum_class_or(tspan, TensorLifespan::FORWARD_FUNC_LIFESPAN) !=
+          TensorLifespan::FORWARD_FUNC_LIFESPAN)
+      grad = tensor_pool.requestTensor(
+        std::get<0>(ts), /// tensor dim
+        grad_exec_order, tspan,
+        std::get<3>(ts) + Var_Grad::grad_suffix, /// name
+        Tensor::Initializer::ZEROS /// tensor initializer
+      );
+
+    tensors_v2.emplace_back(std::make_unique<Var_Grad>(var, grad));
   }
 
+  std::transform(tensors_v2.begin() + current_size, tensors_v2.end(),
+                 std::back_inserter(ret),
+                 [](auto const &elem) { return elem.get(); });
+
   return ret;
 }
diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h
index b4254e2..789e33d 100644
--- a/nntrainer/tensor/manager.h
+++ b/nntrainer/tensor/manager.h
@@ -26,6 +26,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -366,6 +367,11 @@ public:
    */
   void initializeTensors(bool training);
 
+  /**
+   * @brief Check if the manager has allocated tensors
+   *
+   * @return true if tensors allocated, else false
+   */
   bool isAllocated() const { return tensors_allocated; }
 
   /**
@@ -412,11 +418,15 @@ public:
     allocateWeights();
 
     if (!tensors_allocated) {
+      tensor_pool.finalize(BasicPlanner(), 0, max_exec_order);
       if (model_training)
         allocateGradients();
       allocateInOuts();
       if (model_training)
         allocateDerivatives();
+
+      if (tensor_pool.minMemoryRequirement() > 0)
+        tensor_pool.allocate();
       tensors_allocated = true;
     }
   }
@@ -435,6 +445,7 @@ public:
       if (model_training)
         deallocateDerivatives();
 
+      tensor_pool.deallocate();
       tensors_allocated = false;
     }
   }
-- 
2.7.4