From 9d6e0d01c81904aa3012870ac8e98cd9c424e36a Mon Sep 17 00:00:00 2001
From: Parichay Kapoor
Date: Fri, 24 Sep 2021 20:58:10 +0900
Subject: [PATCH] [batchnorm] Optimize batch norm layer

This patch optimizes the batch norm layer and shares the calculations
performed in calcGradient and calcDerivative:
- reuse the dbeta and dgamma calculations
- reduce the number of required temporary variables
- create all the required tensor variables with the context
- add support for checking if the layer is trainable via the run context
- support the average operation with the output tensor already allocated
- this patch reduces memory as much as possible without sacrificing
  speed. More memory optimization is possible at the expense of speed
  but has been omitted for now.

Note: this patch slightly improves performance and adds no extra
operations.

Signed-off-by: Parichay Kapoor
---
 nntrainer/layers/bn_layer.cpp                | 120 +++++++++++++++++++++------
 nntrainer/layers/bn_layer.h                  |   9 +-
 nntrainer/layers/layer_context.cpp           |   5 +-
 nntrainer/layers/layer_context.h             |  16 +++-
 nntrainer/layers/layer_node.cpp              |   7 +-
 nntrainer/layers/time_dist.cpp               |  24 +++---
 nntrainer/tensor/tensor.cpp                  |  33 +++++++-
 nntrainer/tensor/tensor.h                    |  22 +++++
 test/unittest/layers/layers_golden_tests.cpp |   4 +-
 9 files changed, 182 insertions(+), 58 deletions(-)

diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp
index b2ecc18..443a811 100644
--- a/nntrainer/layers/bn_layer.cpp
+++ b/nntrainer/layers/bn_layer.cpp
@@ -32,11 +32,22 @@ namespace nntrainer {
 
 static constexpr size_t SINGLE_INOUT_IDX = 0;
 
-enum BNParams { mu, var, gamma, beta, deviation };
+enum BNParams {
+  mu,
+  var,
+  gamma,
+  beta,
+  deviation,
+  invstd,
+  t_full_fw,
+  t_full_bw,
+  t_reduced
+};
 
 BatchNormalizationLayer::BatchNormalizationLayer(int axis_) :
   Layer(),
   axis(axis_),
+  divider(0),
   wt_idx({0}),
   bn_props(props::Epsilon(), props::BNPARAMS_MU_INIT(),
            props::BNPARAMS_VAR_INIT(), props::BNPARAMS_BETA_INIT(),
@@ -66,11 +77,21 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
   if (axis == -1)
     axis = in_dim.channel() > 1 ? 1 : 3;
 
+  /**
+   * @todo This can be sped up by employing a transpose for convolution. With
+   * the transpose, the channel dimension can be made last, and the remaining
+   * dimensions can be squeezed. This would allow the sum and average to be
+   * faster, with no temporary allocations inside them.
+   */
   dim.setTensorDim(axis, in_dim.getTensorDim(axis));
 
+  divider = 1;
   for (int i = 0; i < 4; ++i) {
-    if (axis != i)
+    if (axis != i) {
       axes_to_reduce.push_back(i);
+      divider *= in_dim.getTensorDim(i);
+    }
   }
 
   wt_idx[BNParams::mu] =
@@ -86,9 +107,33 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
     context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
                           context.getName() + ":beta", true);
 
+  /**
+   * caches the deviation -> input - avg(input)
+   * @todo check if avoiding this storage and adding dependency on input (no
+   * more in-place calculation) can save memory during memory optimization.
+   */
   wt_idx[BNParams::deviation] = context.requestTensor(
     in_dim, context.getName() + ":deviation", Tensor::Initializer::NONE, false,
     TensorLifespan::ITERATION_LIFESPAN);
+  /** caches the inverse standard deviation */
+  wt_idx[BNParams::invstd] = context.requestTensor(
+    dim, context.getName() + ":invstd", Tensor::Initializer::NONE, false,
+    TensorLifespan::ITERATION_LIFESPAN);
+  /**
+   * Temporary tensor to store the reduced tensors along the axes_to_reduce.
+   * This is further used to cache variance + epsilon as well.
+   */
+  wt_idx[BNParams::t_reduced] = context.requestTensor(
+    dim, context.getName() + ":tensor_reduced", Tensor::Initializer::NONE,
+    false, TensorLifespan::ITERATION_LIFESPAN);
+  /** Temporary tensor to store the full sized tensors in forwarding. */
+  wt_idx[BNParams::t_full_fw] = context.requestTensor(
+    in_dim, context.getName() + ":tensor_full_fw", Tensor::Initializer::NONE,
+    false, TensorLifespan::FORWARD_FUNC_LIFESPAN);
+  /** Temporary tensor to store the full sized tensors in backwarding. */
+  wt_idx[BNParams::t_full_bw] = context.requestTensor(
+    in_dim, context.getName() + ":tensor_full_back", Tensor::Initializer::NONE,
+    false, TensorLifespan::BACKWARD_FUNC_LIFESPAN);
 }
 
 void BatchNormalizationLayer::setProperty(
@@ -112,20 +157,23 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
   Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
   Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
   Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
+  Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);
 
-  if (training) {
-    /**
-     * @todo support average with preallocated tensors,
-     * and then register cmu as a temporary tensor
-     */
-    Tensor cmu = input_.average(axes_to_reduce);
-    input_.subtract(cmu, deviation);
+  /** @todo these are not needed for inference, support optimizing these */
+  Tensor &t_reduced = context.getTensor(wt_idx[BNParams::t_reduced]);
+  Tensor &t_full = context.getTensor(wt_idx[BNParams::t_full_fw]);
+  Tensor cvar = t_reduced; /** cache the variance in this tensor for backward */
 
-    Tensor t1 = deviation.pow(2.0f);
-    cvar = t1.average(axes_to_reduce);
+  if (training) {
+    input_.average(axes_to_reduce, t_reduced);
+    input_.subtract(t_reduced, deviation);
 
     mu.multiply_i(momentum);
-    mu.add_i(cmu, 1 - momentum);
+    mu.add_i(t_reduced, 1 - momentum);
+
+    t_full = deviation.pow(2.0f);
+    t_full.average(axes_to_reduce, cvar);
+
     var.multiply_i(momentum);
     var.add_i(cvar, 1 - momentum);
@@ -133,6 +181,7 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
     cvar.pow(-0.5f, invstd);
   } else {
     input_.subtract(mu, deviation);
+    /** @todo do below 2 lines only for first iteration */
     var.add(epsilon, invstd);
     invstd.pow_i(-0.5f);
   }
@@ -148,30 +197,49 @@ void BatchNormalizationLayer::calcDerivative(RunLayerContext &context) {
   Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
   Tensor &dx = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
   Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
+  Tensor &invstd = context.getTensor(wt_idx[BNParams::invstd]);
+
+  Tensor &t_reduced = context.getTensor(wt_idx[BNParams::t_reduced]);
+  Tensor &t_full = context.getTensor(wt_idx[BNParams::t_full_bw]);
+  Tensor cvar = t_reduced;
+
+  if (context.getTrainable()) {
+    /**
+     * This implementation depends on the dbeta pre-calculated in calcGradient.
+     */
+    Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
+    t_reduced = dbeta.divide(divider);
+  } else {
+    t_reduced = deriv.average(axes_to_reduce);
+  }
+
+  deriv.subtract(t_reduced, dx);
 
-  Tensor dx_1 = gamma.multiply(invstd);
-  Tensor dx_2 = deriv.subtract(deriv.average(axes_to_reduce));
+  deviation.multiply(deriv, t_full);
+  t_full.average(axes_to_reduce, t_reduced);
+  t_reduced.divide_i(cvar);
+  deviation.multiply_i(t_reduced);
+  dx.subtract_i(deviation);
 
-  Tensor t1 = deviation.multiply(deriv);
-  Tensor t2 = t1.average(axes_to_reduce);
-  deviation.divide_i(cvar);
-  deviation.multiply_i(t2);
-  dx_2.subtract_i(deviation);
+  if (context.getTrainable()) {
+    /**
+     * This calculates the dgamma tensor.
+     */
+    Tensor &dgamma = context.getWeightGrad(wt_idx[BNParams::gamma]);
+    t_full.multiply_i(invstd);
+    t_full.sum(axes_to_reduce, dgamma);
+  }
 
-  dx_2.multiply(dx_1, dx);
+  invstd.multiply_i(gamma);
+  dx.multiply_i(invstd);
 }
 
 void BatchNormalizationLayer::calcGradient(RunLayerContext &context) {
-
-  Tensor &dgamma = context.getWeightGrad(wt_idx[BNParams::gamma]);
+  /** dgamma is calculated in calcDerivative. dbeta is calculated here */
   Tensor &dbeta = context.getWeightGrad(wt_idx[BNParams::beta]);
   Tensor &deriv = context.getIncomingDerivative(SINGLE_INOUT_IDX);
-  Tensor &deviation = context.getTensor(wt_idx[BNParams::deviation]);
 
   deriv.sum(axes_to_reduce, dbeta);
-  Tensor dev = deviation.multiply(deriv);
-  dev.multiply_i(invstd);
-  dev.sum(axes_to_reduce, dgamma);
 }
 
 void BatchNormalizationLayer::exportTo(Exporter &exporter,
diff --git a/nntrainer/layers/bn_layer.h b/nntrainer/layers/bn_layer.h
index 4e7d0f8..c985564 100644
--- a/nntrainer/layers/bn_layer.h
+++ b/nntrainer/layers/bn_layer.h
@@ -113,14 +113,11 @@ public:
   inline static const std::string type = "batch_normalization";
 
 private:
-  Tensor cvar; /**< training variance saved in bn_layer::forwarding and used in
-                  bn_layer::calcDerivative */
-  Tensor invstd; /**< inversed training std for backward pass */
-
-  int axis; /**< Target axis, axis inferred at initialize when -1 */
+  int axis;      /**< Target axis, axis inferred at initialize when -1 */
+  float divider; /**< size of the axes being reduced */
 
   std::vector axes_to_reduce;    /**< target axes to reduce */
-  std::array wt_idx; /**< indices of the weights and tensors */
+  std::array wt_idx; /**< indices of the weights and tensors */
 
   std::tuple
diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp
index 99c8400..40c85e9 100644
--- a/nntrainer/layers/layer_context.cpp
+++ b/nntrainer/layers/layer_context.cpp
@@ -15,8 +15,8 @@
 #include
 
 namespace nntrainer {
-RunLayerContext::RunLayerContext(const std::string &name, float l,
-                                 const std::vector &w,
+RunLayerContext::RunLayerContext(const std::string &name, bool trainable,
+                                 float l, const std::vector &w,
                                  const std::vector &in,
                                  const std::vector &out,
                                  const std::vector &t) :
@@ -26,6 +26,7 @@ RunLayerContext::RunLayerContext(const std::string &name, float l,
   outputs(out),
   tensors(t) {
   std::get(props).set(name);
+  std::get(props).set(trainable);
   NNTR_THROW_IF(!readyToUse(), std::invalid_argument)
     << "run context is not ready to use upon creation";
 }
diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h
index a09a040..db27f5d 100644
--- a/nntrainer/layers/layer_context.h
+++ b/nntrainer/layers/layer_context.h
@@ -295,6 +295,9 @@ private:
 * structures with memory allocated or support to allocate any new memory, but
 * rather only support storing specifications based on which memory will be
 * allocated later.
+ *
+ * @todo Check the caller of getTensor() and set restrictions on the tensors
+ * to be accessed based on which function is requesting it.
 */
 class RunLayerContext {
 public:
@@ -307,7 +310,7 @@ public:
    * @param out outputs of the layer
    * @param t extra tensors of the layer
    */
-  RunLayerContext(const std::string &name, float l,
+  RunLayerContext(const std::string &name, bool trainable, float l,
                   const std::vector &w,
                   const std::vector &in,
                   const std::vector &out,
@@ -573,6 +576,13 @@ public:
   const std::string &getName() const { return std::get(props); }
 
   /**
+   * @brief get whether the layer is trainable
+   *
+   * @return whether the layer is trainable
+   */
+  bool getTrainable() const { return std::get(props); }
+
+  /**
    * @brief check if run context is set and is ready to use
    *
    * @return true if ready, else false
@@ -580,8 +590,8 @@ public:
   bool readyToUse() const;
 
 private:
-  std::tuple props; /**< props of the layer */
-  float loss;       /**< loss of the layer */
+  std::tuple props; /**< props of the layer */
+  float loss;       /**< loss of the layer */
 
   std::vector weights; /**< weights of the layer */
   std::vector inputs;  /**< inputs of the layer */
diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp
index 8b0fb8b..9d64f01 100644
--- a/nntrainer/layers/layer_node.cpp
+++ b/nntrainer/layers/layer_node.cpp
@@ -485,7 +485,8 @@ void LayerNode::calcDerivative() {
 */
 void LayerNode::calcGradient() {
   START_PROFILE(calc_grad_event_key);
-  layer->calcGradient(*run_context);
+  if (getTrainable())
+    layer->calcGradient(*run_context);
   END_PROFILE(calc_grad_event_key);
 }
 
@@ -537,8 +538,8 @@ void LayerNode::configureRunContext(const std::vector &weights,
                                     const std::vector &inputs,
                                     const std::vector &outputs,
                                     const std::vector &tensors) {
-  run_context = std::make_unique(getName(), 0.0f, weights,
-                                 inputs, outputs, tensors);
+  run_context = std::make_unique(
+    getName(), getTrainable(), 0.0f, weights, inputs, outputs, tensors);
 }
 
 /**
diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp
index 80e9894..bb7799c 100644
--- a/nntrainer/layers/time_dist.cpp
+++ b/nntrainer/layers/time_dist.cpp
@@ -248,9 +248,9 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) {
     out_var.initializeGradient(label_iter);
   }
 
-  RunLayerContext dist_context(context.getName(), context.getLoss(),
-                               getWeightsForContext(), {&in_var}, {&out_var},
-                               getTensorsForContext());
+  RunLayerContext dist_context(context.getName(), context.getTrainable(),
+                               context.getLoss(), getWeightsForContext(),
+                               {&in_var}, {&out_var}, getTensorsForContext());
 
   dist_layer->forwarding(dist_context, training);
 }
@@ -294,9 +294,9 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) {
   out_var.initializeGradient(d_iter);
   out_var.initializeVariable(hval_iter);
 
-  RunLayerContext dist_context(context.getName(), context.getLoss(),
-                               getWeightsForContext(), {&in_var}, {&out_var},
-                               getTensorsForContext());
+  RunLayerContext dist_context(context.getName(), context.getTrainable(),
+                               context.getLoss(), getWeightsForContext(),
+                               {&in_var}, {&out_var}, getTensorsForContext());
 
   dist_layer->calcDerivative(dist_context);
 }
@@ -346,9 +346,9 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) {
   in_var.initializeVariable(in_iter);
   out_var.initializeGradient(d_iter);
 
-  RunLayerContext dist_context(context.getName(), context.getLoss(),
-                               getWeightsForContext(), {&in_var}, {&out_var},
-                               getTensorsForContext());
+  RunLayerContext dist_context(context.getName(), context.getTrainable(),
+                               context.getLoss(), getWeightsForContext(),
+                               {&in_var}, {&out_var}, getTensorsForContext());
 
   dist_layer->calcGradient(dist_context);
 }
@@ -389,9 +389,9 @@ void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) {
   fillWeightsFromContext(context);
   fillTensorsFromContext(context);
 
-  RunLayerContext dist_context(context.getName(), context.getLoss(),
-                               getWeightsForContext(), {&in_var}, {&out_var},
-                               getTensorsForContext());
+  RunLayerContext dist_context(context.getName(), context.getTrainable(),
+                               context.getLoss(), getWeightsForContext(),
+                               {&in_var}, {&out_var}, getTensorsForContext());
 
   dist_layer->setBatch(dist_context, batch);
 
diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index 9abc409..6072034 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -1180,20 +1180,36 @@ void Tensor::read(std::ifstream &file) {
 * @brief Calculate average value according to the axis.
 */
 Tensor Tensor::average(unsigned int axis) const {
+  Tensor t;
+  return average(axis, t);
+}
+
+/**
+ * @brief Calculate average value according to the axis.
+ */
+Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
   if (axis >= TensorDim::MAXDIM)
     throw std::out_of_range(
       "negative axis or axis more then MAXDIM is invalid");
 
   unsigned int axis_size = dim.getDim()[axis];
   if (axis_size == 1)
-    return this->clone();
+    output.copy(*this);
+  else
+    this->sum(axis, output, 1.0 / ((float)axis_size));
 
-  return this->sum(axis, 1.0 / ((float)axis_size));
+  return output;
 }
 
 Tensor Tensor::average(const std::vector &axes) const {
+  Tensor t;
+  return average(axes, t);
+}
+
+Tensor &Tensor::average(const std::vector &axes,
+                        Tensor &output) const {
   if (axes.empty())
-    return this->average();
+    return this->average(output);
 
   TensorDim ret_shape;
   for (const auto &idx : axes) {
@@ -1203,7 +1219,7 @@ Tensor Tensor::average(const std::vector &axes) const {
     ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
   }
 
-  return this->sum(axes, 1.0 / (float)ret_shape.getDataLen());
+  return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
 }
 
 /**
@@ -1215,6 +1231,15 @@ Tensor Tensor::average() const {
   return result.average(3);
 }
 
+/**
+ * @brief Calculate average value according to the axis.
+ */
+Tensor &Tensor::average(Tensor &output) const {
+  Tensor result = *this;
+  result.reshape({1, 1, 1, dim.getDataLen()});
+  return result.average(3, output);
+}
+
 void Tensor::setValue(float val) {
   float *data = getData();
   std::fill(data, data + size(), val);
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 7c9a64c..5dc1e92 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -597,6 +597,7 @@ public:
    */
   Tensor &sum(const std::vector &axes, Tensor &output,
               float alpha = 1.0) const;
+
   /**
    * @brief Averaging the Tensor elements according to the axis
    * 0 : batch direction
   * @retval Calculated Tensor
   */
   Tensor average(unsigned int axis) const;
+  /**
+   * @brief Averaging the Tensor elements according to the axis
+   *
+   * @retval Calculated Tensor
+   */
+  Tensor &average(unsigned int axis, Tensor &output) const;
 
   /**
    * @brief average all the Tensor by multiple axes
    *
    * @retval Calculated Tensor
    */
   Tensor average(const std::vector &axes) const;
 
   /**
+   * @brief average all the Tensor by multiple axes
+   *
+   * @param axes axes to sum along
+   * @param output output tensor
+   * @return Tensor
+   */
+  Tensor &average(const std::vector &axes, Tensor &output) const;
+
+  /**
    * @brief Averaging the Tensor elements by all axis
    * @retval Calculated Tensor
    */
   Tensor average() const;
 
   /**
+   * @brief Averaging the Tensor elements by all axis
+   * @retval Calculated Tensor
+   */
+  Tensor &average(Tensor &output) const;
+
+  /**
    * @brief Anchor a starting point to defer following evaluation
    * @retval LazyTensor class that can be used with run();
    */
diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp
index dac2641..eef15e9 100644
--- a/test/unittest/layers/layers_golden_tests.cpp
+++ b/test/unittest/layers/layers_golden_tests.cpp
@@ -131,8 +131,8 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) {
   };
 
   auto rc =
-    RunLayerContext("golden", 0.0f, create_view(weights), create_view(ins),
-                    create_view(outs), create_view(tensors));
+    RunLayerContext("golden", true, 0.0f, create_view(weights),
+                    create_view(ins), create_view(outs), create_view(tensors));
 
   auto num_outputs = rc.getNumOutputs();
 
-- 
2.7.4
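For readers following the calcGradient/calcDerivative split, the arithmetic the patch shares between the two passes can be sketched independently of nntrainer. The snippet below is a minimal, self-contained approximation over a single normalized feature: plain std::vector instead of Tensor, and hypothetical names (bn_backward, BnBackwardResult) that do not exist in the codebase. It only illustrates how dbeta = sum(dy), computed for the gradient, doubles as mean(dy) = dbeta / divider in the derivative, and how a single reduction of deviation * dy feeds both dgamma and dx; it is not the layer's actual implementation.

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

struct BnBackwardResult {
  float dbeta;           /**< sum of the incoming derivative */
  float dgamma;          /**< sum of deviation * dy * invstd */
  std::vector<float> dx; /**< derivative w.r.t. the layer input */
};

// deviation = x - mean(x), invstd = 1 / sqrt(var + eps) and cvar = var + eps
// are assumed to be cached from the forward pass, mirroring the tensors the
// patch requests from the run context.
BnBackwardResult bn_backward(const std::vector<float> &deviation, float invstd,
                             float cvar, float gamma,
                             const std::vector<float> &dy) {
  const std::size_t n = dy.size(); // plays the role of 'divider' in the patch
  BnBackwardResult r{0.0f, 0.0f, std::vector<float>(n)};

  // calcGradient: dbeta = sum(dy)
  for (float g : dy)
    r.dbeta += g;

  // calcDerivative reuses dbeta instead of reducing dy a second time:
  // mean(dy) = dbeta / divider
  const float dy_mean = r.dbeta / static_cast<float>(n);

  // one reduction of deviation * dy feeds both dgamma and the dx correction
  float dot = 0.0f;
  for (std::size_t i = 0; i < n; ++i)
    dot += deviation[i] * dy[i];
  r.dgamma = dot * invstd;
  const float proj = (dot / static_cast<float>(n)) / cvar;

  for (std::size_t i = 0; i < n; ++i)
    r.dx[i] = (dy[i] - dy_mean - deviation[i] * proj) * gamma * invstd;
  return r;
}

int main() {
  // toy example: 4 samples of one feature, already centered (mean 0, var 1.25)
  std::vector<float> deviation = {-1.5f, -0.5f, 0.5f, 1.5f};
  std::vector<float> dy = {0.1f, -0.2f, 0.3f, 0.4f};
  const float cvar = 1.25f + 1e-5f; // var + epsilon
  const float invstd = 1.0f / std::sqrt(cvar);

  BnBackwardResult r = bn_backward(deviation, invstd, cvar, 1.0f, dy);
  std::cout << "dbeta=" << r.dbeta << " dgamma=" << r.dgamma << '\n';
  return 0;
}
```

In the patch itself the same reuse appears as `t_reduced = dbeta.divide(divider)` replacing a second `deriv.average(axes_to_reduce)` whenever the layer is trainable, which is why dgamma moves into calcDerivative and only dbeta remains in calcGradient.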