From 1dc26fa13f4cf0d14b8120bc9eb5ffda06ab91a4 Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Tue, 27 Jul 2021 15:47:48 +0900
Subject: [PATCH] [weight] Weight cleanup related to initializer

This patch cleans up the weight with respect to the initializer.
Weights no longer take care of the initializer themselves; they let var_grad
handle it, which in turn lets tensor handle it.

Signed-off-by: Parichay Kapoor
---
 nntrainer/tensor/tensor.cpp   | 10 +++++--
 nntrainer/tensor/var_grad.cpp |  9 +++++-
 nntrainer/tensor/var_grad.h   |  4 ++-
 nntrainer/tensor/weight.cpp   | 69 ++-----------------------------------------
 nntrainer/tensor/weight.h     | 27 +++--------------
 5 files changed, 24 insertions(+), 95 deletions(-)

diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index bafbdc2..06f8e6f 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -153,10 +153,12 @@ void Tensor::allocate() {
     data = std::shared_ptr<float>(src_tensor->tensor()->data,
                                   src_tensor->tensor()->data.get() +
                                     src_tensor->offset());
+    /** as this memory is shared, do NOT initialize */
   } else {
     /// allocate new memory for the tensor data
     data = std::shared_ptr<float>(new float[dim.getDataLen()],
                                   std::default_delete<float[]>());
+    initialize();
   }
 }
@@ -541,9 +543,11 @@ void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
    * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
    * if the batch size of src is updated and needs reallocation.
    */
-  if (src.data)
-    dest.data = std::shared_ptr<float>(src.data, src.data.get() + offset);
-  else if (!src.src_tensor)
+  dest.data = nullptr;
+  if (src.data) {
+    dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
+    dest.allocate();
+  } else if (!src.src_tensor)
     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
   else
     dest.src_tensor = std::make_shared<SrcSharedTensor>(
diff --git a/nntrainer/tensor/var_grad.cpp b/nntrainer/tensor/var_grad.cpp
index ab5a09c..8eb2435 100644
--- a/nntrainer/tensor/var_grad.cpp
+++ b/nntrainer/tensor/var_grad.cpp
@@ -26,6 +26,10 @@ Var_Grad::Var_Grad(const TensorDim &dim, const Tensor::Initializer init,
   name(name) {
   var = std::make_shared<Tensor>(dim, alloc_now, init);
   if (need_gradient)
+    /**
+     * @todo the gradient initializer should be none, and the gradient should
+     * be set to zero right before it is used by the user.
+     */
     grad = std::make_shared<Tensor>(dim, alloc_now, Tensor::Initializer::ZEROS);
   else
     grad = std::make_shared<Tensor>();
 }
@@ -34,6 +38,7 @@ void Var_Grad::initializeVariable(const Tensor &preallocated) {
   if (!preallocated.empty()) {
     var->makeSharedDataTensor(preallocated);
+    /** intentionally not initializing tensor memory for shared tensors */
   }
 }
@@ -44,6 +49,7 @@ void Var_Grad::initializeGradient(const Tensor &preallocated) {
      * with other layers but the internal memory is.
      */
     grad->makeSharedDataTensor(preallocated);
+    /** intentionally not initializing tensor memory for shared tensors */
   }
   /**
    * No need to reset gradient here. With shared memory, each gradient setting
@@ -58,7 +64,8 @@ void Var_Grad::needsGradient(bool ng) {
   need_gradient = ng;
   if (need_gradient && grad->empty()) {
     bool alloc_now_ = var->isAllocated();
-    grad = std::make_shared<Tensor>(var->getDim(), alloc_now_);
+    grad =
+      std::make_shared<Tensor>(dim, alloc_now_, Tensor::Initializer::ZEROS);
   }
 }
diff --git a/nntrainer/tensor/var_grad.h b/nntrainer/tensor/var_grad.h
index 0f003f1..d3700b6 100644
--- a/nntrainer/tensor/var_grad.h
+++ b/nntrainer/tensor/var_grad.h
@@ -234,10 +234,12 @@ public:
    *
    * @note New dimension must maintain the shape of the variable
    */
-  void reset(const TensorDim &tdim, bool ng) {
+  void reset(const TensorDim &tdim, Tensor::Initializer init, bool ng) {
     dim = tdim;
     if (!var->empty())
       var->reshape(dim);
+    var->initialize(init);
+
     if (!grad->empty())
       grad->reshape(dim);
     need_gradient = ng;
diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp
index a7de41f..50db0ca 100644
--- a/nntrainer/tensor/weight.cpp
+++ b/nntrainer/tensor/weight.cpp
@@ -22,79 +22,14 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init,
                const WeightRegularizer reg, const float reg_const, bool train,
                bool alloc_now_, std::string name) :
   Var_Grad(dim, init, train, alloc_now_, name),
-  initializer(init),
   regularizer(reg),
   regularizer_constant(reg_const) {
-  if (initializer == Tensor::Initializer::NONE)
-    throw std::invalid_argument("Weight initializer unknown");
+  if (init == Tensor::Initializer::NONE)
+    throw std::invalid_argument("Weight initializer cannot be none");
   if (regularizer == WeightRegularizer::UNKNOWN)
     throw std::invalid_argument("Weight regularizer unknown");
 }
 
-void Weight::initializeVariable(const Tensor &preallocated) {
-  Var_Grad::initializeVariable(preallocated);
-
-  if (alloc_now)
-    runVariableInitializer();
-}
-
-void Weight::runVariableInitializer() {
-  Tensor &var_ref = getVariableRef();
-  const TensorDim dim = var_ref.getDim();
-
-  unsigned int fan_in, fan_out;
-
-  /// @fixme: when unit is equal to one, this does not work, we need to rely on
-  /// effective dimension then actual numbers here. For now, some heuristics
-  /// added to infer what would be fan_in/fan_out
-  if (dim.batch() * dim.channel() * dim.height() == 1) {
-    fan_out = fan_in = dim.width();
-  } else if (dim.batch() * dim.channel() == 1) { /// fully connected layers
-    fan_in = dim.height();
-    fan_out = dim.width();
-  } else { /// convolution filters, @todo extend this to > 4
-    auto field_size = dim.height() * dim.width();
-
-    // this also handles below cases.
-    // 1. fan_in = fan_out = 1 as well.
-    // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
-    fan_in = dim.channel() * field_size;
-    fan_out = dim.batch() * field_size;
-  }
-
-  switch (initializer) {
-  case Tensor::Initializer::ZEROS:
-    var_ref.setZero();
-    break;
-  case Tensor::Initializer::ONES:
-    var_ref.setValue(1.0f);
-    break;
-  case Tensor::Initializer::LECUN_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
-    break;
-  case Tensor::Initializer::XAVIER_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
-    break;
-  case Tensor::Initializer::HE_NORMAL:
-    var_ref.setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
-    break;
-  case Tensor::Initializer::LECUN_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in),
-                           sqrtFloat(1.0f / fan_in));
-    break;
-  case Tensor::Initializer::XAVIER_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
-                           sqrtFloat(6.0 / (fan_in + fan_out)));
-    break;
-  case Tensor::Initializer::HE_UNIFORM:
-    var_ref.setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
-                           sqrtFloat(6.0 / (fan_in)));
-    break;
-  default:
-    break;
-  }
-}
-
 void Weight::initializeGradient(const Tensor &preallocated) {
   // Use self variable to initialize itself
   Var_Grad::initializeGradient(preallocated);
diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h
index 71a343f..6724772 100644
--- a/nntrainer/tensor/weight.h
+++ b/nntrainer/tensor/weight.h
@@ -41,7 +41,6 @@ public:
    */
   Weight() :
     Var_Grad(),
-    initializer(Tensor::Initializer::NONE),
     regularizer(WeightRegularizer::UNKNOWN),
     regularizer_constant(1.0f) {}
@@ -94,16 +93,10 @@ public:
    */
   explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "") :
     Var_Grad(v, g, n),
-    initializer(Tensor::Initializer::XAVIER_UNIFORM),
     regularizer(WeightRegularizer::NONE),
     regularizer_constant(1.0f) {}
 
   /**
-   * @copydoc var_grad::initializeVariable(const Tensor &)
-   */
-  void initializeVariable(const Tensor &preallocated = Tensor());
-
-  /**
    * @copydoc var_grad::initializeGradient(const Tensor &)
    */
   void initializeGradient(const Tensor &preallocated = Tensor());
@@ -118,7 +111,6 @@ public:
   friend void swap(Weight &lhs, Weight &rhs) noexcept {
     using std::swap;
     swap(static_cast<Var_Grad &>(lhs), static_cast<Var_Grad &>(rhs));
-    swap(lhs.initializer, rhs.initializer);
     swap(lhs.regularizer, rhs.regularizer);
   }
@@ -179,11 +171,10 @@ public:
    */
   void reset(const TensorDim &dim, const Tensor::Initializer init,
              const WeightRegularizer reg, const float reg_const, bool ng) {
-    initializer = init;
     regularizer = reg;
     regularizer_constant = reg_const;
 
-    Var_Grad::reset(dim, ng);
+    Var_Grad::reset(dim, init, ng);
   }
 
   /**
@@ -213,17 +204,13 @@ public:
   /**
    * @brief Allocate and initialize the weight variable, if needed
    */
-  void allocateVariable() {
-    Var_Grad::allocateVariable();
-    runVariableInitializer();
-  }
+  void allocateVariable() { Var_Grad::allocateVariable(); }
 
   /**
    * @brief Allocate and initialize the weight gradient, if needed
    */
   void allocateGradient() {
     Var_Grad::allocateGradient();
-    resetGradient();
     allocateOptimizerVariables();
   }
@@ -275,19 +262,13 @@ public:
   }
 
 private:
-  Tensor::Initializer initializer; /**< initializer for this variable */
-  WeightRegularizer regularizer;   /**< regularizer for this variable */
-  float regularizer_constant;      /**< constant factor for regularization */
+  WeightRegularizer regularizer; /**< regularizer for this variable */
+  float regularizer_constant;    /**< constant factor for regularization */
   std::vector<Tensor> opt_vars;        /**< optimizer variables */
   std::vector<TensorDim> opt_vars_dim; /**< optimizer variables dimensions */
   /**
-   * @brief Initialize the weight with the initializer
-   */
-  void runVariableInitializer();
-
-  /**
    * @brief Allocate optimizer related variables for the given weights
    */
   void allocateOptimizerVariables();
-- 
2.7.4
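
Note: the sketch below illustrates the ownership change this patch makes, where the weight only forwards the initializer and the tensor that owns the memory runs it at allocation time, while shared or preallocated memory is deliberately left untouched. It is a simplified, self-contained example, not nntrainer code: SimpleTensor, SimpleWeight, and the Initializer enum here are illustrative stand-ins for Tensor, Weight/Var_Grad, and Tensor::Initializer.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

enum class Initializer { NONE, ZEROS, ONES };

class SimpleTensor {
public:
  SimpleTensor(std::size_t len, bool alloc_now, Initializer init) :
    len(len), initializer(init) {
    if (alloc_now)
      allocate();
  }

  void allocate() {
    if (data)
      return; // shared/preallocated memory: do NOT re-initialize
    data = std::make_shared<std::vector<float>>(len);
    initialize(); // fresh memory: run the initializer right after allocation
  }

  void initialize() {
    if (!data || initializer == Initializer::NONE)
      return;
    float v = (initializer == Initializer::ONES) ? 1.0f : 0.0f;
    std::fill(data->begin(), data->end(), v);
  }

private:
  std::size_t len;
  Initializer initializer;
  std::shared_ptr<std::vector<float>> data;
};

// The weight-like wrapper only forwards the initializer; after this cleanup
// it carries no initialization logic of its own.
class SimpleWeight {
public:
  SimpleWeight(std::size_t len, Initializer init) :
    var(len, /*alloc_now=*/false, init),
    grad(len, /*alloc_now=*/false, Initializer::ZEROS) {}

  void allocate() {
    var.allocate();  // variable is initialized inside the tensor layer
    grad.allocate(); // gradient starts at zero, as noted in the patch's @todo
  }

private:
  SimpleTensor var;
  SimpleTensor grad;
};

int main() {
  SimpleWeight w(4, Initializer::ONES);
  w.allocate();
  std::cout << "weight allocated and initialized by the tensor layer\n";
  return 0;
}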