From 3dba61670e7a0f4563853669f4d3787c25821a7a Mon Sep 17 00:00:00 2001 From: Donghyeon Jeong Date: Tue, 9 Jul 2024 20:57:46 +0900 Subject: [PATCH] [Tensor] Update newly added features This commit updates recently added features in tensor, including add_i_partial() and ele_mul(). The newly added functions have been implemented according to the revised tensor structure. **Changes proposed in this PR:** - Update Float/HalfTensor class with newly added function, add_i_partial(). - Apply BLAS operations in basic arithmetic operations in Tensor. - height-width transpose in half-precision can be SIMD accelerated. **Self-evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: Donghyeon Jeong --- nntrainer/tensor/float_tensor.cpp | 49 ++++++++------------------- nntrainer/tensor/float_tensor.h | 7 ++++ nntrainer/tensor/half_tensor.cpp | 55 +++++++++++++------------------ nntrainer/tensor/half_tensor.h | 7 ++++ nntrainer/tensor/tensor.cpp | 29 ++++++++-------- nntrainer/tensor/tensor.h | 27 +++++++-------- nntrainer/tensor/tensor_base.h | 7 ++++ 7 files changed, 87 insertions(+), 94 deletions(-) diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp index 2652610d..5ac40ca7 100644 --- a/nntrainer/tensor/float_tensor.cpp +++ b/nntrainer/tensor/float_tensor.cpp @@ -396,18 +396,8 @@ Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output, const float beta) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::multiplies()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + beta * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, 
out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -436,17 +426,7 @@ Tensor &FloatTensor::divide(float const &value, Tensor &output) const { Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); @@ -522,6 +502,15 @@ int FloatTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { return ML_ERROR_NONE; } +int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, + (float *)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + Tensor &FloatTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus(), std::placeholders::_1, value); apply(f, output); @@ -532,18 +521,8 @@ Tensor &FloatTensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - std::fpclassify(alpha) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::plus()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * alpha; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, 
out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; diff --git a/nntrainer/tensor/float_tensor.h b/nntrainer/tensor/float_tensor.h index 5463e9f1..1adfebb2 100644 --- a/nntrainer/tensor/float_tensor.h +++ b/nntrainer/tensor/float_tensor.h @@ -267,6 +267,13 @@ public: */ int add_i(Tensor const &m, Tensor &output, float const alpha) override; + /** + * @copydoc Tensor::add_i_partial() + */ + int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, const Tensor alphas, + unsigned int alpha_idx) override; + /** * @copydoc Tensor::add(float const &value, Tensor &output) */ diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp index 2f66f1c0..26ac4e85 100644 --- a/nntrainer/tensor/half_tensor.cpp +++ b/nntrainer/tensor/half_tensor.cpp @@ -395,17 +395,8 @@ Tensor &HalfTensor::multiply(Tensor const &m, Tensor &output, const float beta) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - ele_mul(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -495,6 +486,15 @@ int HalfTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { return ML_ERROR_NONE; } +int HalfTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, + (_FP16 
*)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + Tensor &HalfTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); @@ -506,16 +506,8 @@ Tensor &HalfTensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) { - ele_add(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; @@ -1035,17 +1027,7 @@ Tensor &HalfTensor::divide(float const &value, Tensor &output) const { Tensor &HalfTensor::divide(Tensor const &m, Tensor &output) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides<_FP16>()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); @@ -1136,7 +1118,14 @@ Tensor &HalfTensor::transpose(const std::string &direction, } } else { if (is_format_nchw) { - transposeloop(l, i, k, j, SL, SI, SK, SJ); + for (unsigned int b = 0; b < batch(); ++b) { + for (unsigned int c = 0; c < channel(); ++c) { + transpose_matrix( + height(), width(), (_FP16 *)getData() + getIndex(b, c, 0, 0), + width(), (_FP16 *)output.getData() + output.getIndex(b, c, 0, 0), 
+ output.width()); + } + } } else { transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI); } diff --git a/nntrainer/tensor/half_tensor.h b/nntrainer/tensor/half_tensor.h index 6ca35e4f..43988207 100644 --- a/nntrainer/tensor/half_tensor.h +++ b/nntrainer/tensor/half_tensor.h @@ -267,6 +267,13 @@ public: */ int add_i(Tensor const &m, Tensor &output, float const alpha) override; + /** + * @copydoc Tensor::add_i_partial() + */ + int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, const Tensor alphas, + unsigned int alpha_idx) override; + /** * @copydoc Tensor::add(float const &value, Tensor &output) */ diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 28bc271f..2c815e29 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -251,6 +251,20 @@ Tensor Tensor::multiply(Tensor const &m, const float beta) const { Tensor &Tensor::multiply(Tensor const &m, Tensor &output, const float beta) const { + NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) + << "Tensor Format of " << getName() << ":" + << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. (" + << ((bool)(m.getFormat()) ? 
"NHWC" : "NCHW") << ")"; + + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), + std::invalid_argument) + << getName() << " is not contiguous, cannot multiply"; + + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), + std::invalid_argument) + << getName() << " is not contiguous, cannot multiply"; itensor->multiply(m, output, beta); return output; } @@ -355,19 +369,8 @@ int Tensor::add_i(Tensor const &m, float const alpha) { int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, unsigned int incY, const Tensor alphas, unsigned int alpha_idx) { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, - getAddress(addr_idx), incY); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, - getAddress<_FP16>(addr_idx), incY); -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); - return ML_ERROR_INVALID_PARAMETER; -#endif - } - return ML_ERROR_NONE; + return itensor->add_i_partial(len, addr_idx, m, incX, incY, alphas, + alpha_idx); } Tensor Tensor::add(Tensor const &m, float const alpha) const { diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 789a4eb6..bfd98978 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -23,6 +23,7 @@ #include +#include #include #include @@ -735,19 +736,19 @@ public: */ int add_i(Tensor const &m, float const alpha = 1.F); -/** - * @brief Do add_i for specific section - * - * @param len Length of the specific section - * @param addr_idx Starting index of the psecific section - * @param m Input Tensor to be added - * @param incX Incremental index of X - * @param incY Incremental index of Y - * @param alphas Vector of multiple alpha values - * @param alpha_idx Index of alpha in alpha vector - * @retval 
#ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ + /** + * @brief Do add_i for specific section + * + * @param len Length of the specific section + * + * @param addr_idx Starting index of the specific section + * + * @param m Input Tensor to be added + * + * @param incX Incremental index of X + * + * @param incY Incremental index of Y + * + * @param alphas Vector of multiple alpha values + * + * @param alpha_idx Index of alpha in alpha vector + * + * @retval #ML_ERROR_NONE Successful + * + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + */ int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, unsigned int incY, const Tensor alphas, unsigned int alpha_idx); diff --git a/nntrainer/tensor/tensor_base.h b/nntrainer/tensor/tensor_base.h index 945b82b3..1831de0a 100644 --- a/nntrainer/tensor/tensor_base.h +++ b/nntrainer/tensor/tensor_base.h @@ -283,6 +283,13 @@ public: */ virtual int add_i(Tensor const &m, Tensor &output, float const alpha) = 0; + /** + * @copydoc Tensor::add_i_partial() + */ + virtual int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) = 0; + /** + * @copydoc Tensor::add(float const &value, Tensor &output) */ -- 2.34.1