This commit updates recently added tensor features, including add_i_partial() and ele_mul().
The newly added functions are implemented to match the revised tensor structure.
**Changes proposed in this PR:**
- Update the FloatTensor/HalfTensor classes with the newly added add_i_partial() function (see the usage sketch below).
- Use BLAS routines for Tensor's basic arithmetic operations.
- Enable SIMD-accelerated height-width transpose in half precision.
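A minimal usage sketch of the new add_i_partial() API is shown below. Only the add_i_partial() signature and its saxpy-based semantics come from this PR; the header path, tensor shapes, and variable names are illustrative assumptions.

```cpp
// Hypothetical sketch: accumulate a scaled section of one tensor into another.
// Only add_i_partial()'s signature/semantics are taken from this PR; shapes,
// names, and the surrounding helper function are illustrative.
#include <tensor.h>

using nntrainer::Tensor;

void accumulate_section(Tensor &dst, Tensor &src) {
  // Small tensor holding candidate scale factors; alpha_idx picks one of them.
  Tensor alphas(1, 1, 1, 2);
  alphas.setValue(0, 0, 0, 0, 1.0f);
  alphas.setValue(0, 0, 0, 1, 0.5f);

  // dst[addr_idx + i * incY] += alphas[alpha_idx] * src[i * incX], i in [0, len)
  dst.add_i_partial(/*len=*/static_cast<unsigned int>(src.size()),
                    /*addr_idx=*/0, src,
                    /*incX=*/1, /*incY=*/1, alphas, /*alpha_idx=*/1);
}
```

With the revised tensor structure, Tensor::add_i_partial() forwards to itensor->add_i_partial(), and the FloatTensor/HalfTensor implementations dispatch to saxpy() for the actual accumulation.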
**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: Donghyeon Jeong <dhyeon.jeong@samsung.com>
const float beta) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
- if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 &&
- std::fpclassify(beta) == FP_ZERO) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::multiplies<float>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf * *m_buf + beta * *out_buf;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += output.getStrides()[3];
- }
- }
+ ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
+ strides[3]);
};
NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
- if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::divides<float>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf / *m_buf;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += output.getStrides()[3];
- }
- }
+ ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
};
apply_broadcast(m, f, output);
return ML_ERROR_NONE;
}
+int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
+ Tensor &m, unsigned int incX, unsigned int incY,
+ const Tensor alphas, unsigned int alpha_idx) {
+ saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
+ (float *)getAddress(addr_idx), incY);
+
+ return ML_ERROR_NONE;
+}
+
Tensor &FloatTensor::add(float const &value, Tensor &output) const {
auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
apply(f, output);
float const alpha) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
- if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
- std::fpclassify(alpha) == FP_ZERO) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::plus<float>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf + *m_buf * alpha;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += strides[3];
- }
- }
+ ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
+ strides[3]);
};
apply_broadcast(m, f, output);
return output;
*/
int add_i(Tensor const &m, Tensor &output, float const alpha) override;
+ /**
+ * @copydoc Tensor::add_i_partial()
+ */
+ int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
+ unsigned int incX, unsigned int incY, const Tensor alphas,
+ unsigned int alpha_idx) override;
+
/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/
const float beta) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
- if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 &&
- std::fpclassify(beta) == FP_ZERO) {
- ele_mul(e.buffer_size, buf, m_buf, out_buf);
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += output.getStrides()[3];
- }
- }
+ ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
+ strides[3]);
};
NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
return ML_ERROR_NONE;
}
+int HalfTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
+ Tensor &m, unsigned int incX, unsigned int incY,
+ const Tensor alphas, unsigned int alpha_idx) {
+ saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX,
+ (_FP16 *)getAddress(addr_idx), incY);
+
+ return ML_ERROR_NONE;
+}
+
Tensor &HalfTensor::add(float const &value, Tensor &output) const {
auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1,
static_cast<_FP16>(value));
float const alpha) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
- if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) {
- ele_add(e.buffer_size, buf, m_buf, out_buf);
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha);
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += strides[3];
- }
- }
+ ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
+ strides[3]);
};
apply_broadcast(m, f, output);
return output;
Tensor &HalfTensor::divide(Tensor const &m, Tensor &output) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
- if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::divides<_FP16>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf / *m_buf;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += output.getStrides()[3];
- }
- }
+ ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
};
apply_broadcast(m, f, output);
}
} else {
if (is_format_nchw) {
- transposeloop(l, i, k, j, SL, SI, SK, SJ);
+ for (unsigned int b = 0; b < batch(); ++b) {
+ for (unsigned int c = 0; c < channel(); ++c) {
+ transpose_matrix(
+ height(), width(), (_FP16 *)getData() + getIndex(b, c, 0, 0),
+ width(), (_FP16 *)output.getData() + output.getIndex(b, c, 0, 0),
+ output.width());
+ }
+ }
} else {
transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
}
*/
int add_i(Tensor const &m, Tensor &output, float const alpha) override;
+ /**
+ * @copydoc Tensor::add_i_partial()
+ */
+ int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
+ unsigned int incX, unsigned int incY, const Tensor alphas,
+ unsigned int alpha_idx) override;
+
/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/
Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
const float beta) const {
+ NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
+ << "Tensor Format of " << getName() << ":"
+ << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match. ("
+ << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
+
+ NNTR_THROW_IF(!getContiguous() || !m.getContiguous() ||
+ !output.getContiguous(),
+ std::invalid_argument)
+ << getName() << " is not contiguous, cannot multiply";
+
itensor->multiply(m, output, beta);
return output;
}
int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY,
const Tensor alphas, unsigned int alpha_idx) {
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
- saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
- getAddress<float>(addr_idx), incY);
- } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX,
- getAddress<_FP16>(addr_idx), incY);
-#else
- ml_loge("%s", "Error: enable-fp16 is not enabled");
- return ML_ERROR_INVALID_PARAMETER;
-#endif
- }
- return ML_ERROR_NONE;
+ return itensor->add_i_partial(len, addr_idx, m, incX, incY, alphas,
+ alpha_idx);
}
Tensor Tensor::add(Tensor const &m, float const alpha) const {
#include <cstddef>
+#include <blas_interface.h>
#include <nntrainer_log.h>
#include <tensor_base.h>
*/
int add_i(Tensor const &m, float const alpha = 1.F);
-/**
- * @brief Do add_i for specific section
- *
- * @param len Length of the specific section
- * @param addr_idx Starting index of the psecific section
- * @param m Input Tensor to be added
- * @param incX Incremental index of X
- * @param incY Incremental index of Y
- * @param alphas Vector of multiple alpha values
- * @param alpha_idx Index of alpha in alpha vector
- * @retval #ML_ERROR_NONE Successful
- * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
- */
+ /**
+ * @brief Do add_i for specific section
+ *
+ * @param len Length of the specific section
+ * @param addr_idx Starting index of the specific section
+ * @param m Input Tensor to be added
+ * @param incX Stride (increment) over the elements of m
+ * @param incY Stride (increment) over this Tensor's section
+ * @param alphas Tensor holding multiple alpha values
+ * @param alpha_idx Index of alpha in alpha vector
+ * @retval #ML_ERROR_NONE Successful
+ * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
+ */
int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY, const Tensor alphas,
unsigned int alpha_idx);
*/
virtual int add_i(Tensor const &m, Tensor &output, float const alpha) = 0;
+ /**
+ * @copydoc Tensor::add_i_partial()
+ */
+ virtual int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
+ unsigned int incX, unsigned int incY,
+ const Tensor alphas, unsigned int alpha_idx) = 0;
+
/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/