From: hyeonseok lee Date: Mon, 22 Nov 2021 06:46:17 +0000 (+0900) Subject: [grucell] Implement grucell X-Git-Tag: accepted/tizen/unified/20220323.062643~189 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=eee292425ae3723d0d7f99a18175fd0204f92c25;p=platform%2Fcore%2Fml%2Fnntrainer.git [grucell] Implement grucell - Support add with non-contiguous tensor - Implement grucell which is run only 1 timestep - Todo: 1. Make it more efficient with strided tensor(Reduce copy tensor) 2. Reduce temporary tensor or do inplace operation Self evaluation: Build test: [X]Passed [ ]Failed [ ]Skipped Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: hyeonseok lee --- diff --git a/nntrainer/layers/grucell.cpp b/nntrainer/layers/grucell.cpp new file mode 100644 index 0000000..bb4e831 --- /dev/null +++ b/nntrainer/layers/grucell.cpp @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2020 hyeonseok lee + * + * @file grucell.cpp + * @date 28 Oct 2021 + * @brief This is Gated Recurrent Unit Cell Layer Class of Neural Network + * @see https://github.com/nnstreamer/nntrainer + * @author hyeonseok lee + * @bug No known bugs except for NYI items + * + * h_prev --------d1------->[*]-------d0----->[+]---d0--> h + * dh_nx | | | | d0 dh + * | d14 | d2 d3 | + * | | +-----[1-]------>[*] + * | [*]<---+ d15 |d5 | d6 + * | | |rt | zt |gt + * | | [sig] [sig] [tanh] + * | | |d16 | d7 |d8 + * | | [+] [+] [+] + * | | / \d16 | \ d7 / \ d8 + * | | Whhr Wxhr Whhz Wxhz Whhg Wxhg + * | | |d17 |d13 |d12 |d11 |d10 | d9 + * +- |--+------|---+ | | | + * +---------|--------|----------+ | + * xs------------------+--------+---------------+ + */ + +#include + +#include +#include +#include +#include +#include +#include + +namespace nntrainer { + +static constexpr size_t SINGLE_INOUT_IDX = 0; + +enum GRUCellParams { + weight_xh, + weight_hh, + bias_h, + hidden_state, + zrg, + dropout_mask +}; + +#define ENABLE_BIAS_IH 1 +// Todo: enable bias_hh +#define ENABLE_BIAS_HH 0 + +// Todo: handle with strided tensor more efficiently and reduce temporary +// tensors +GRUCellLayer::GRUCellLayer() : + LayerImpl(), + grucell_props(props::Unit(), props::HiddenStateActivation(), + props::RecurrentActivation(), props::DropOutRate(), + props::MaxTimestep(), props::Timestep()), + wt_idx({0}), + acti_func(ActivationType::ACT_NONE, true), + recurrent_acti_func(ActivationType::ACT_NONE, true), + epsilon(1e-3) {} + +// - weight_xh ( input to hidden ) +// : [1, 1, input_size, unit (hidden_size) x NUM_GATE] -> z, r, g +// - weight_hh ( hidden to hidden ) +// : [1, 1, unit (hidden_size) , unit (hidden_size) x NUM_GATE] -> z, r, g +// - bias_h ( hidden bias ) +// : [1, 1, 1, unit (hidden_size) x NUM_GATE] -> z, r, g +void GRUCellLayer::finalize(InitLayerContext &context) { + auto &weight_regularizer = + std::get(*layer_impl_props); + auto &weight_regularizer_constant = + std::get(*layer_impl_props); + auto &weight_initializer = + std::get(*layer_impl_props); + auto &bias_initializer = std::get(*layer_impl_props); + + const unsigned int unit = std::get(grucell_props).get(); + auto &hidden_state_activation_type = + std::get(grucell_props); + auto &recurrent_activation_type = + std::get(grucell_props); + const float dropout_rate = std::get(grucell_props); + const unsigned int max_timestep = std::get(grucell_props); + + if (context.getNumInputs() != 1) { + throw std::invalid_argument("GRUCell layer takes only one input"); + } + + // input_dim = [ batch, 1, 1, feature_size ] + const TensorDim &input_dim = context.getInputDimensions()[0]; + if (input_dim.height() != 1 && input_dim.channel() != 1) { + throw std::invalid_argument( + "Input must be single time dimension for GRUCell"); + } + + const unsigned int batch_size = input_dim.batch(); + const unsigned int feature_size = input_dim.width(); + + // output_dim = [ batch, 1, 1, hidden_size (unit)] + TensorDim output_dim(batch_size, 1, 1, unit); + context.setOutputDimensions({output_dim}); + + if (dropout_rate > epsilon) { + wt_idx[GRUCellParams::dropout_mask] = context.requestTensor( + output_dim, "dropout_mask", Tensor::Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); + } + + TensorDim weight_xh_dim({feature_size, NUM_GATE * unit}); + TensorDim weight_hh_dim({unit, NUM_GATE * unit}); + TensorDim bias_dim({NUM_GATE * unit}); + + // weight_initializer can be set seperately. weight_xh initializer, + // weight_hh initializer kernel initializer & recurrent_initializer in keras + // for now, it is set same way. + wt_idx[GRUCellParams::weight_xh] = + context.requestWeight(weight_xh_dim, weight_initializer, weight_regularizer, + weight_regularizer_constant, "weight_xh", true); + wt_idx[GRUCellParams::weight_hh] = + context.requestWeight(weight_hh_dim, weight_initializer, weight_regularizer, + weight_regularizer_constant, "weight_hh", true); + wt_idx[GRUCellParams::bias_h] = context.requestWeight( + bias_dim, bias_initializer, WeightRegularizer::NONE, 1.0f, "bias_h", true); + + TensorDim hidden_state_dim(max_timestep * batch_size, 1, 1, unit); + wt_idx[GRUCellParams::hidden_state] = context.requestTensor( + hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, + TensorLifespan::ITERATION_LIFESPAN); + + TensorDim zrg_dim(max_timestep * batch_size, 1, 1, unit * NUM_GATE); + wt_idx[GRUCellParams::zrg] = + context.requestTensor(zrg_dim, "zrg", Tensor::Initializer::NONE, true, + TensorLifespan::ITERATION_LIFESPAN); + + if (hidden_state_activation_type.get() == ActivationType::ACT_NONE) { + hidden_state_activation_type.set(ActivationType::ACT_TANH); + } + acti_func.setActiFunc(hidden_state_activation_type.get()); + + if (recurrent_activation_type.get() == ActivationType::ACT_NONE) { + recurrent_activation_type.set(ActivationType::ACT_SIGMOID); + } + recurrent_acti_func.setActiFunc(recurrent_activation_type.get()); +} + +void GRUCellLayer::setProperty(const std::vector &values) { + auto remain_props = loadProperties(values, grucell_props); + LayerImpl::setProperty(remain_props); +} + +void GRUCellLayer::exportTo(Exporter &exporter, + const ExportMethods &method) const { + LayerImpl::exportTo(exporter, method); + exporter.saveResult(grucell_props, method, this); +} + +void GRUCellLayer::forwarding(RunLayerContext &context, bool training) { + const unsigned int unit = std::get(grucell_props).get(); + const float dropout_rate = std::get(grucell_props); + const unsigned int max_timestep = std::get(grucell_props); + const unsigned int timestep = std::get(grucell_props); + + Tensor &weight_xh = context.getWeight(wt_idx[GRUCellParams::weight_xh]); + Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]); + Tensor &bias_ih = context.getWeight(wt_idx[GRUCellParams::bias_h]); + + Tensor &input = context.getInput(SINGLE_INOUT_IDX); + Tensor &hidden_states = + context.getTensor(wt_idx[GRUCellParams::hidden_state]); + Tensor &zrg_gates = context.getTensor(wt_idx[GRUCellParams::zrg]); + Tensor prev_hidden_state; + + const TensorDim &input_dim = input.getDim(); + const unsigned int batch_size = input_dim.batch(); + + hidden_states.reshape({max_timestep, 1, batch_size, unit}); + zrg_gates.reshape({max_timestep, 1, batch_size, NUM_GATE * unit}); + + Tensor hidden_state = hidden_states.getBatchSlice(timestep, 1); + if (!timestep) { + prev_hidden_state = Tensor(batch_size, unit); + prev_hidden_state.setZero(); + } else { + prev_hidden_state = hidden_states.getBatchSlice(timestep - 1, 1); + } + Tensor zrg_gate = zrg_gates.getBatchSlice(timestep, 1); + + input.dot(weight_xh, zrg_gate); // x_z, x_r, x_g + + Tensor zr_gate = + zrg_gate.getSharedDataTensor({batch_size, 2 * unit}, 0, false); + Tensor g_gate = + zrg_gate.getSharedDataTensor({batch_size, unit}, 2 * unit, false); + + Tensor weight_hh_zr; + Tensor weight_hh_g; + weight_hh_zr.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false)); + weight_hh_g.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false)); + + if (timestep) { + zr_gate.add_i_strided(prev_hidden_state.dot(weight_hh_zr)); + } + Tensor bias_ih_zr = bias_ih.getSharedDataTensor({2 * unit}, 0); + zr_gate.add_i(bias_ih_zr); + + recurrent_acti_func.run_fn(zr_gate, zr_gate); + + Tensor z_gate = zr_gate.getSharedDataTensor({batch_size, unit}, 0, false); + Tensor r_gate = zr_gate.getSharedDataTensor({batch_size, unit}, unit, false); + + Tensor temp; + prev_hidden_state.dot(weight_hh_g, temp, false, false); +#if ENABLE_BIAS_HH + // Todo: fix this to get the bias_hh_g from bias_hh + Tensor bias_hh_g = bias_ih.getSharedDataTensor({unit}, 2 * unit); + temp.add_i(bias_hh_g); +#endif + temp.multiply_i_strided(r_gate); + g_gate.add_i_strided(temp); +#if ENABLE_BIAS_IH + Tensor bias_ih_g = bias_ih.getSharedDataTensor({unit}, 2 * unit); + g_gate.add_i(bias_ih_g); +#endif + + acti_func.run_fn(g_gate, g_gate); + + z_gate.multiply_strided(prev_hidden_state, hidden_state); + temp = z_gate.multiply(-1.0).add(1.0); + hidden_state.add_i(g_gate.multiply_strided(temp)); + + if (dropout_rate > epsilon && training) { + Tensor mask = context.getTensor(wt_idx[GRUCellParams::dropout_mask]); + mask.dropout_mask(dropout_rate); + hidden_state.multiply_i(mask); + } + + Tensor &output = context.getOutput(SINGLE_INOUT_IDX); + output.copy(hidden_state); +} + +void GRUCellLayer::calcDerivative(RunLayerContext &context) { + const unsigned int unit = std::get(grucell_props).get(); + const unsigned int max_timestep = std::get(grucell_props); + const unsigned int timestep = std::get(grucell_props); + const TensorDim &input_dim = context.getInput(SINGLE_INOUT_IDX).getDim(); + const unsigned int batch_size = input_dim.batch(); + + Tensor &zrg_gates_derivatives = + context.getTensorGrad(wt_idx[GRUCellParams::zrg]); + zrg_gates_derivatives.reshape({max_timestep, 1, batch_size, NUM_GATE * unit}); + Tensor zrg_gate_derivative = zrg_gates_derivatives.getBatchSlice(timestep, 1); + Tensor &weight_xh = context.getWeight(wt_idx[GRUCellParams::weight_xh]); + Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); + + zrg_gate_derivative.dot(weight_xh, outgoing_derivative, false, true); +} + +void GRUCellLayer::calcGradient(RunLayerContext &context) { + const unsigned int unit = std::get(grucell_props).get(); + const float dropout_rate = std::get(grucell_props); + const unsigned int max_timestep = std::get(grucell_props); + const unsigned int timestep = std::get(grucell_props); + + Tensor &input = context.getInput(SINGLE_INOUT_IDX); + const TensorDim &input_dim = input.getDim(); + const unsigned int batch_size = input_dim.batch(); + + Tensor &djdweight_xh = + context.getWeightGrad(wt_idx[GRUCellParams::weight_xh]); + Tensor &djdweight_hh = + context.getWeightGrad(wt_idx[GRUCellParams::weight_hh]); + Tensor &djdbias_ih = context.getWeightGrad(wt_idx[GRUCellParams::bias_h]); + Tensor &weight_hh = context.getWeight(wt_idx[GRUCellParams::weight_hh]); + Tensor &bias_h = context.getWeight(wt_idx[GRUCellParams::bias_h]); + Tensor bias_h_g = bias_h.getSharedDataTensor({unit}, 2 * unit); + + Tensor djdw_zr_h = + djdweight_hh.getSharedDataTensor({unit, 2 * unit}, 0, false); + Tensor djdw_g_h = + djdweight_hh.getSharedDataTensor({unit, unit}, 2 * unit, false); + Tensor &hidden_states = + context.getTensor(wt_idx[GRUCellParams::hidden_state]); + hidden_states.reshape({max_timestep, 1, batch_size, unit}); + Tensor hidden_state = hidden_states.getBatchSlice(timestep, 1); + Tensor &hidden_states_derivatives = + context.getTensorGrad(wt_idx[GRUCellParams::hidden_state]); + Tensor &incoming_derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &zrg_gates = context.getTensor(wt_idx[GRUCellParams::zrg]); + zrg_gates.reshape({max_timestep, 1, batch_size, NUM_GATE * unit}); + Tensor zrg_gate = zrg_gates.getBatchSlice(timestep, 1); + Tensor &zrg_gates_derivatives = + context.getTensorGrad(wt_idx[GRUCellParams::zrg]); + zrg_gates_derivatives.reshape({max_timestep, 1, batch_size, NUM_GATE * unit}); + Tensor zrg_gate_derivative = zrg_gates_derivatives.getBatchSlice(timestep, 1); + + hidden_states_derivatives.reshape({max_timestep, 1, batch_size, unit}); + Tensor hidden_state_derivative = + hidden_states_derivatives.getBatchSlice(timestep, 1); + hidden_state_derivative.reshape(incoming_derivative.getDim()); + if (timestep + 1 == max_timestep) { + djdweight_xh.setZero(); + djdweight_hh.setZero(); + djdbias_ih.setZero(); + hidden_state_derivative.copyData(incoming_derivative); + } else { + hidden_state_derivative.add_i(incoming_derivative); + } + // restore the dimension + hidden_state_derivative.reshape({1, 1, batch_size, unit}); + + Tensor hs_prev; + Tensor dh_nx; + if (timestep) { + hs_prev = hidden_states.getBatchSlice(timestep - 1, 1); + dh_nx = hidden_states_derivatives.getBatchSlice(timestep - 1, 1); + } else { + dh_nx = Tensor(batch_size, unit); + hs_prev = Tensor(batch_size, unit); + hs_prev.setZero(); + } + + if (dropout_rate > epsilon) { + hidden_states_derivatives.multiply_i( + context.getTensor(wt_idx[GRUCellParams::dropout_mask])); + } + + Tensor dhz = + zrg_gate_derivative.getSharedDataTensor({batch_size, unit}, 0, false); + Tensor dhr = + zrg_gate_derivative.getSharedDataTensor({batch_size, unit}, unit, false); + Tensor dhg = zrg_gate_derivative.getSharedDataTensor({batch_size, unit}, + unit * 2, false); + + Tensor zt = zrg_gate.getSharedDataTensor({batch_size, unit}, 0, false); + Tensor rt = zrg_gate.getSharedDataTensor({batch_size, unit}, unit, false); + Tensor gt = zrg_gate.getSharedDataTensor({batch_size, unit}, unit * 2, false); + + hidden_state_derivative.multiply_strided(zt, dh_nx); // dh_nx = d1 + hidden_state_derivative.multiply_strided(hs_prev, dhz); // dhz = d2 + dhz.add_i_strided(hidden_state_derivative.multiply_strided(gt), + -1.0f); // dhz = d5 + zt.multiply(-1.0, dhg); + dhg.add_i(1.0); + dhg.multiply_i_strided(hidden_state_derivative); // dhg = d6 + + recurrent_acti_func.run_prime_fn(zt, dhz, dhz); // dhz = d7 + acti_func.run_prime_fn(gt, dhg, dhg); // dhg = d8 + + Tensor dhzr = zrg_gate_derivative.getSharedDataTensor({batch_size, unit * 2}, + 0, false); // dhz+dhr + + Tensor wg_hh; + wg_hh.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit}, unit * 2, false)); + Tensor wzr_hh; + wzr_hh.copy_with_stride( + weight_hh.getSharedDataTensor({1, 1, unit, unit * 2}, 0, false)); + + Tensor temp = Tensor(batch_size, unit); + Tensor dhg_; + dhg_.copy_with_stride(dhg); + hs_prev.dot(wg_hh, temp); +#if ENABLE_BIAS_HH + temp.add_i(bias_h_g); +#endif + dhg_.multiply_strided(temp, dhr); // dhr = d15 + + // reset temp : hs_prev * rt for djdbias_hh_g and dh_nx + dhg_.multiply_strided(rt, temp); +#if ENABLE_BIAS_HH + // Todo: fix this to get the djdbias_hh_g from djdbias_hh + Tensor djdbias_hh_g = djdbias_ih.getSharedDataTensor({unit}, 2 * unit); + temp.sum(2, djdbias_hh_g, 1.0, 1.0); +#endif + temp.dot(wg_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + + recurrent_acti_func.run_prime_fn(rt, dhr, dhr); // dhr = d16 + +#if ENABLE_BIAS_HH + // Todo: fix this to get the djdbias_hh_zr from djdbias_hh + Tensor djdbias_hh_zr = djdbias_ih.getSharedDataTensor({2 * unit}, 0); + djdbias_hh_zr.add_i( + zrg_gate_derivative.sum(2).getSharedDataTensor({2 * unit}, 0)); +#endif +#if ENABLE_BIAS_IH + zrg_gate_derivative.sum(2, djdbias_ih, 1.0, 1.0); +#endif + + djdweight_xh.add_i(input.dot(zrg_gate_derivative, true, false)); + + Tensor dhzr_; + dhzr_.copy_with_stride(dhzr); + djdw_zr_h.add_i_strided(hs_prev.dot(dhzr_, true, false)); + djdw_g_h.add_i_strided(hs_prev.dot(temp, true, false)); + dhzr_.dot(wzr_hh, dh_nx, false, true, 1.0); // dh_nx = d1 + d14 + d12 + d17 +} + +void GRUCellLayer::setBatch(RunLayerContext &context, unsigned int batch) { + unsigned int &max_timestep = std::get(grucell_props); + context.updateTensor(wt_idx[GRUCellParams::hidden_state], + max_timestep * batch); + context.updateTensor(wt_idx[GRUCellParams::zrg], max_timestep * batch); + context.updateTensor(wt_idx[GRUCellParams::dropout_mask], batch); +} + +} // namespace nntrainer diff --git a/nntrainer/layers/grucell.h b/nntrainer/layers/grucell.h new file mode 100644 index 0000000..ef3ca3c --- /dev/null +++ b/nntrainer/layers/grucell.h @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2021 hyeonseok lee + * + * @file grucell.h + * @date 28 Oct 2021 + * @brief This is Gated Recurrent Unit Cell Layer Class of Neural Network + * @see https://github.com/nnstreamer/nntrainer + * @author hyeonseok lee + * @bug No known bugs except for NYI items + * + */ + +#ifndef __GRUCELL_H__ +#define __GRUCELL_H__ +#ifdef __cplusplus + +#include +#include +#include + +namespace nntrainer { + +/** + * @class GRUCellLayer + * @brief GRUCellLayer + */ +class GRUCellLayer : public LayerImpl { +public: + /** + * @brief Constructor of GRUCellLayer + */ + GRUCellLayer(); + + /** + * @brief Destructor of GRUCellLayer + */ + ~GRUCellLayer() = default; + + /** + * @brief Move constructor. + * @param[in] GRUCellLayer && + */ + GRUCellLayer(GRUCellLayer &&rhs) noexcept = default; + + /** + * @brief Move assignment operator. + * @parma[in] rhs GRUCellLayer to be moved. + */ + GRUCellLayer &operator=(GRUCellLayer &&rhs) = default; + + /** + * @copydoc Layer::finalize(InitLayerContext &context) + */ + void finalize(InitLayerContext &context) override; + + /** + * @copydoc Layer::forwarding(RunLayerContext &context, bool training) + */ + void forwarding(RunLayerContext &context, bool training) override; + + /** + * @copydoc Layer::calcDerivative(RunLayerContext &context) + */ + void calcDerivative(RunLayerContext &context) override; + + /** + * @copydoc Layer::calcGradient(RunLayerContext &context) + */ + void calcGradient(RunLayerContext &context) override; + + /** + * @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method) + */ + void exportTo(Exporter &exporter, const ExportMethods &method) const override; + + /** + * @copydoc Layer::getType() + */ + const std::string getType() const override { return GRUCellLayer::type; }; + + /** + * @copydoc Layer::supportBackwarding() + */ + bool supportBackwarding() const override { return true; } + + /** + * @copydoc Layer::setProperty(const PropertyType type, const std::string + * &value) + */ + void setProperty(const std::vector &values) override; + + /** + * @copydoc Layer::setBatch(RunLayerContext &context, unsigned int batch) + */ + void setBatch(RunLayerContext &context, unsigned int batch) override; + + inline static const std::string type = "grucell"; + +private: + static constexpr unsigned int NUM_GATE = 3; + + /** + * Unit: number of output neurons + * HiddenStateActivation: activation type for hidden state. default is tanh + * RecurrentActivation: activation type for recurrent. default is sigmoid + * DropOutRate: dropout rate + * MaxTimeStep: Maximum timestep of gru + * TimeStep: timestep for which gru should operate + * + * */ + std::tuple + grucell_props; + std::array wt_idx; /**< indices of the weights */ + + /** + * @brief activation function for h_t : default is sigmoid + */ + ActiFunc acti_func; + + /** + * @brief activation function for recurrent : default is tanh + */ + ActiFunc recurrent_acti_func; + + /** + * @brief to protect overflow + */ + float epsilon; +}; +} // namespace nntrainer + +#endif /* __cplusplus */ +#endif /* __GRUCELL_H__ */ diff --git a/nntrainer/layers/meson.build b/nntrainer/layers/meson.build index 1cf5dc9..f389be0 100644 --- a/nntrainer/layers/meson.build +++ b/nntrainer/layers/meson.build @@ -31,6 +31,7 @@ layer_sources = [ 'permute_layer.cpp', 'layer_impl.cpp', 'gru.cpp', + 'grucell.cpp', 'dropout.cpp', 'centroid_knn.cpp', 'layer_context.cpp', diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 812a081..8ae68a8 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -387,6 +387,61 @@ Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output, return output; } +int Tensor::add_i_strided(Tensor const &m, const float beta) { + try { + this->add_strided(m, *this, beta); + } catch (std::exception &err) { + ml_loge("%s %s", typeid(err).name(), err.what()); + return ML_ERROR_INVALID_PARAMETER; + } + + return ML_ERROR_NONE; +} + +Tensor Tensor::add_strided(Tensor const &m, const float beta) const { + Tensor t; + return this->add_strided(m, t, beta); +} + +Tensor &Tensor::add_strided(Tensor const &m, Tensor &output, + const float beta) const { + /** TODO: throw than create new dimenions */ + CREATE_IF_EMPTY_DIMS(output, dim); + + if (size() != m.size() || size() != output.size()) + throw std::invalid_argument( + "Strided addition does not support broadcasting"); + + if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || + beta != 0.0) { + for (unsigned int b = 0; b < batch(); ++b) { + for (unsigned int c = 0; c < channel(); ++c) { + for (unsigned int h = 0; h < height(); ++h) { + for (unsigned int w = 0; w < width(); ++w) { + output.setValue( + b, c, h, w, getValue(b, c, h, w) + m.getValue(b, c, h, w) * beta); + } + } + } + } + } else { + /** @todo optimize this with combining these loops where stride is 1 */ + for (unsigned int b = 0; b < batch(); ++b) { + for (unsigned int c = 0; c < channel(); ++c) { + for (unsigned int h = 0; h < height(); ++h) { + float *out_data = output.getAddress(b, c, h, 0); + const float *m_data = m.getAddress(b, c, h, 0); + const float *in_data = getAddress(b, c, h, 0); + std::transform(in_data, in_data + width(), m_data, out_data, + std::plus()); + } + } + } + } + + return output; +} + int Tensor::multiply_i(float const &value) { NNTR_THROW_IF(!contiguous, std::invalid_argument) << getName() << " is not contiguous, cannot multiply"; diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index d170b18..198ac4d 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -391,6 +391,47 @@ public: const float beta = 0.0) const; /** + * @brief Add Tensor Elementwise + * @param[in] m Tensor to be added + * @param[in] beta scalar to add output with and add + * @retval #ML_ERROR_NONE successful + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add_i + */ + int add_i_strided(Tensor const &m, const float beta = 0.0); + + /** + * @brief Add Tensor Element by Element + * @param[in] m Tensor to be added + * @param[in] beta Value to be scale the added tensor + * @retval Calculated Tensor + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add + */ + Tensor add_strided(Tensor const &m, const float beta = 0.0) const; + + /** + * @brief Add Tensor Element by Element + * @param[in] m Tensor to be added + * @param[out] output Tensor to store the result + * @param[in] beta Value to be scale the added tensor + * @retval Calculated Tensor + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add + */ + Tensor &add_strided(Tensor const &m, Tensor &output, + const float beta = 0.0) const; + + /** * @brief Divide value element by element immediately * @param[in] value divisor * @retval #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right