From 1dc26fa13f4cf0d14b8120bc9eb5ffda06ab91a4 Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Tue, 27 Jul 2021 15:47:48 +0900
Subject: [PATCH] [weight] Weight cleanup related to initializer

This patch cleans up the weight with respect to the initializer.
Weights no longer take care of the initializer themselves; they let var_grad
handle it, which in turn lets tensor handle it.

Signed-off-by: Parichay Kapoor
---
 nntrainer/tensor/tensor.cpp   | 10 +++++--
 nntrainer/tensor/var_grad.cpp |  9 +++++-
 nntrainer/tensor/var_grad.h   |  4 ++-
 nntrainer/tensor/weight.cpp   | 69 ++-----------------------------------------
 nntrainer/tensor/weight.h     | 27 +++--------------
 5 files changed, 24 insertions(+), 95 deletions(-)

diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index bafbdc2..06f8e6f 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -153,10 +153,12 @@ void Tensor::allocate() {
     data = std::shared_ptr<float>(src_tensor->tensor()->data,
                                   src_tensor->tensor()->data.get() +
                                     src_tensor->offset());
+    /** as this memory is shared, do NOT initialize */
   } else {
     /// allocate new memory for the tensor data
     data = std::shared_ptr<float>(new float[dim.getDataLen()],
                                   std::default_delete<float[]>());
+    initialize();
   }
 }
@@ -541,9 +543,11 @@ void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
    * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
    * if the batch size of src is updated and needs reallocation.
    */
-  if (src.data)
-    dest.data = std::shared_ptr<float>(src.data, src.data.get() + offset);
-  else if (!src.src_tensor)
+  dest.data = nullptr;
+  if (src.data) {
+    dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
+    dest.allocate();
+  } else if (!src.src_tensor)
     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
   else
     dest.src_tensor = std::make_shared<SrcSharedTensor>(
diff --git a/nntrainer/tensor/var_grad.cpp b/nntrainer/tensor/var_grad.cpp
index ab5a09c..8eb2435 100644
--- a/nntrainer/tensor/var_grad.cpp
+++ b/nntrainer/tensor/var_grad.cpp
@@ -26,6 +26,10 @@ Var_Grad::Var_Grad(const TensorDim &dim, const Tensor::Initializer init,
   name(name) {
   var = std::make_shared<Tensor>(dim, alloc_now, init);
   if (need_gradient)
+    /**
+     * @todo the gradient initializer should be none, and the gradient should
+     * be set to zero right before it is used by the user.
+     */
     grad = std::make_shared<Tensor>(dim, alloc_now, Tensor::Initializer::ZEROS);
   else
     grad = std::make_shared<Tensor>();
 }
@@ -34,6 +38,7 @@ void Var_Grad::initializeVariable(const Tensor &preallocated) {
   if (!preallocated.empty()) {
     var->makeSharedDataTensor(preallocated);
+    /** intentionally not initializing tensor memory for shared tensors */
   }
 }
@@ -44,6 +49,7 @@ void Var_Grad::initializeGradient(const Tensor &preallocated) {
      * with other layers but the internal memory is.
      */
     grad->makeSharedDataTensor(preallocated);
+    /** intentionally not initializing tensor memory for shared tensors */
   }
   /**
    * No need to reset gradient here. With shared memory, each gradient setting
@@ -58,7 +64,8 @@ void Var_Grad::needsGradient(bool ng) {
   need_gradient = ng;
   if (need_gradient && grad->empty()) {
     bool alloc_now_ = var->isAllocated();
-    grad = std::make_shared<Tensor>(var->getDim(), alloc_now_);
+    grad =
+      std::make_shared<Tensor>(dim, alloc_now_, Tensor::Initializer::ZEROS);
   }
 }
diff --git a/nntrainer/tensor/var_grad.h b/nntrainer/tensor/var_grad.h
index 0f003f1..d3700b6 100644
--- a/nntrainer/tensor/var_grad.h
+++ b/nntrainer/tensor/var_grad.h
@@ -234,10 +234,12 @@ public:
    *
    * @note New dimension must maintain the shape of the variable
    */
-  void reset(const TensorDim &tdim, bool ng) {
+  void reset(const TensorDim &tdim, Tensor::Initializer init, bool ng) {
     dim = tdim;
     if (!var->empty())
       var->reshape(dim);
+    var->initialize(init);
+
     if (!grad->empty())
       grad->reshape(dim);
     need_gradient = ng;
diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp
index a7de41f..50db0ca 100644
--- a/nntrainer/tensor/weight.cpp
+++ b/nntrainer/tensor/weight.cpp
@@ -22,79 +22,14 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init,
                const WeightRegularizer reg, const float reg_const, bool train,
                bool alloc_now_, std::string name) :
   Var_Grad(dim, init, train, alloc_now_, name),
-  initializer(init),
   regularizer(reg),
   regularizer_constant(reg_const) {
-  if (initializer == Tensor::Initializer::NONE)
-    throw std::invalid_argument("Weight initializer unknown");
+  if (init == Tensor::Initializer::NONE)
+    throw std::invalid_argument("Weight initializer cannot be none");
   if (regularizer == WeightRegularizer::UNKNOWN)
     throw std::invalid_argument("Weight regularizer unknown");
 }
 
-void Weight::initializeVariable(const Tensor &preallocated) {
-  Var_Grad::initializeVariable(preallocated);
-
-  if (alloc_now)
-    runVariableInitializer();
-}
-
-void Weight::runVariableInitializer() {
-  Tensor &var_ref = getVariableRef();
-  const TensorDim dim = var_ref.getDim();
-
-  unsigned int fan_in, fan_out;
-
-  /// @fixme: when unit is equal to one, this does not work, we need to rely on
-  /// effective dimension then actual numbers here. For now, some heuristics
-  /// added to infer what would be fan_in/fan_out
-  if (dim.batch() * dim.channel() * dim.height() == 1) {
-    fan_out = fan_in = dim.width();
-  } else if (dim.batch() * dim.channel() == 1) { /// fully connected layers
-    fan_in = dim.height();
-    fan_out = dim.width();
-  } else { /// convolution filters, @todo extend this to > 4
-    auto field_size = dim.height() * dim.width();
-
-    // this also handles below cases.
-    // 1. fan_in = fan_out = 1 as well.
-    // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
-    fan_in = dim.channel() * field_size;
-    fan_out = dim.batch() * field_size;
-  }
-
-  switch (initializer) {
-  case Tensor::Initializer::ZEROS:
-    var_ref.setZero();
-    break;
-  case Tensor::Initializer::ONES:
-    var_ref.setValue(1.0f);
-    break;
-  case Tensor::Initializer::LECUN_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
-    break;
-  case Tensor::Initializer::XAVIER_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
-    break;
-  case Tensor::Initializer::HE_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
-    break;
-  case Tensor::Initializer::LECUN_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in),
-                           sqrtFloat(1.0f / fan_in));
-    break;
-  case Tensor::Initializer::XAVIER_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
-                           sqrtFloat(6.0 / (fan_in + fan_out)));
-    break;
-  case Tensor::Initializer::HE_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
-                           sqrtFloat(6.0 / (fan_in)));
-    break;
-  default:
-    break;
-  }
-}
-
 void Weight::initializeGradient(const Tensor &preallocated) {
   // Use self variable to initialize itself
   Var_Grad::initializeGradient(preallocated);
diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h
index 71a343f..6724772 100644
--- a/nntrainer/tensor/weight.h
+++ b/nntrainer/tensor/weight.h
@@ -41,7 +41,6 @@ public:
    */
   Weight() :
     Var_Grad(),
-    initializer(Tensor::Initializer::NONE),
     regularizer(WeightRegularizer::UNKNOWN),
     regularizer_constant(1.0f) {}
@@ -94,16 +93,10 @@ public:
    */
   explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "") :
     Var_Grad(v, g, n),
-    initializer(Tensor::Initializer::XAVIER_UNIFORM),
     regularizer(WeightRegularizer::NONE),
     regularizer_constant(1.0f) {}
 
   /**
-   * @copydoc var_grad::initializeVariable(const Tensor &)
-   */
-  void initializeVariable(const Tensor &preallocated = Tensor());
-
-  /**
    * @copydoc var_grad::initializeGradient(const Tensor &)
    */
   void initializeGradient(const Tensor &preallocated = Tensor());
@@ -118,7 +111,6 @@ public:
   friend void swap(Weight &lhs, Weight &rhs) noexcept {
     using std::swap;
     swap(static_cast<Var_Grad &>(lhs), static_cast<Var_Grad &>(rhs));
-    swap(lhs.initializer, rhs.initializer);
     swap(lhs.regularizer, rhs.regularizer);
   }
@@ -179,11 +171,10 @@ public:
    */
   void reset(const TensorDim &dim, const Tensor::Initializer init,
              const WeightRegularizer reg, const float reg_const, bool ng) {
-    initializer = init;
     regularizer = reg;
     regularizer_constant = reg_const;
 
-    Var_Grad::reset(dim, ng);
+    Var_Grad::reset(dim, init, ng);
   }
 
   /**
@@ -213,17 +204,13 @@ public:
   /**
    * @brief Allocate and initialize the weight variable, if needed
    */
-  void allocateVariable() {
-    Var_Grad::allocateVariable();
-    runVariableInitializer();
-  }
+  void allocateVariable() { Var_Grad::allocateVariable(); }
 
   /**
    * @brief Allocate and initialize the weight gradient, if needed
    */
   void allocateGradient() {
     Var_Grad::allocateGradient();
-    resetGradient();
     allocateOptimizerVariables();
   }
@@ -275,19 +262,13 @@ public:
   }
 
 private:
-  Tensor::Initializer initializer; /**< initializer for this variable */
-  WeightRegularizer regularizer;   /**< regularizer for this variable */
-  float regularizer_constant;      /**< constant factor for regularization */
+  WeightRegularizer regularizer; /**< regularizer for this variable */
+  float regularizer_constant;    /**< constant factor for regularization */
   std::vector<Tensor> opt_vars;        /**< optimizer variables */
   std::vector<TensorDim> opt_vars_dim; /**< optimizer variables dimensions */
   /**
-   * @brief Initialize the weight with the initializer
-   */
-  void runVariableInitializer();
-
-  /**
    * @brief Allocate optimizer related variables for the given weights
    */
   void allocateOptimizerVariables();
-- 
2.7.4
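
Note: the sketch below illustrates the ownership change this patch makes, where the weight only forwards the initializer and the tensor that owns the memory runs it at allocation time, while shared or preallocated memory is deliberately left untouched. It is a simplified, self-contained example, not nntrainer code: SimpleTensor, SimpleWeight, and the Initializer enum here are illustrative stand-ins for Tensor, Weight/Var_Grad, and Tensor::Initializer.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

enum class Initializer { NONE, ZEROS, ONES };

class SimpleTensor {
public:
  SimpleTensor(std::size_t len, bool alloc_now, Initializer init) :
    len(len), initializer(init) {
    if (alloc_now)
      allocate();
  }

  void allocate() {
    if (data)
      return; // shared/preallocated memory: do NOT re-initialize
    data = std::make_shared<std::vector<float>>(len);
    initialize(); // fresh memory: run the initializer right after allocation
  }

  void initialize() {
    if (!data || initializer == Initializer::NONE)
      return;
    float v = (initializer == Initializer::ONES) ? 1.0f : 0.0f;
    std::fill(data->begin(), data->end(), v);
  }

private:
  std::size_t len;
  Initializer initializer;
  std::shared_ptr<std::vector<float>> data;
};

// The weight-like wrapper only forwards the initializer; after this cleanup
// it carries no initialization logic of its own.
class SimpleWeight {
public:
  SimpleWeight(std::size_t len, Initializer init) :
    var(len, /*alloc_now=*/false, init),
    grad(len, /*alloc_now=*/false, Initializer::ZEROS) {}

  void allocate() {
    var.allocate();  // variable is initialized inside the tensor layer
    grad.allocate(); // gradient starts at zero, as noted in the patch's @todo
  }

private:
  SimpleTensor var;
  SimpleTensor grad;
};

int main() {
  SimpleWeight w(4, Initializer::ONES);
  w.allocate();
  std::cout << "weight allocated and initialized by the tensor layer\n";
  return 0;
}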