From 3dba61670e7a0f4563853669f4d3787c25821a7a Mon Sep 17 00:00:00 2001 From: Donghyeon Jeong Date: Tue, 9 Jul 2024 20:57:46 +0900 Subject: [PATCH] [Tensor] Update newly added features This commit updates recently added features in tensor, including add_i_partial() and ele_mul(). The newly added functions have been implemented according to the revised tensor structure. **Changes proposed in this PR:** - Update Float/HalfTensor class with newly added function, add_i_partial(). - Apply BLAS operations in basic arithmetic operations in Tensor. - height-width transpose in half-precision can be SIMD accelerated. **Self-evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: Donghyeon Jeong --- nntrainer/tensor/float_tensor.cpp | 49 ++++++++------------------- nntrainer/tensor/float_tensor.h | 7 ++++ nntrainer/tensor/half_tensor.cpp | 55 +++++++++++++------------------ nntrainer/tensor/half_tensor.h | 7 ++++ nntrainer/tensor/tensor.cpp | 29 ++++++++-------- nntrainer/tensor/tensor.h | 27 +++++++-------- nntrainer/tensor/tensor_base.h | 7 ++++ 7 files changed, 87 insertions(+), 94 deletions(-) diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp index 2652610d..5ac40ca7 100644 --- a/nntrainer/tensor/float_tensor.cpp +++ b/nntrainer/tensor/float_tensor.cpp @@ -396,18 +396,8 @@ Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output, const float beta) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::multiplies()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + beta * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, 
out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -436,17 +426,7 @@ Tensor &FloatTensor::divide(float const &value, Tensor &output) const { Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); @@ -522,6 +502,15 @@ int FloatTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { return ML_ERROR_NONE; } +int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, + (float *)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + Tensor &FloatTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus(), std::placeholders::_1, value); apply(f, output); @@ -532,18 +521,8 @@ Tensor &FloatTensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - std::fpclassify(alpha) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::plus()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * alpha; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, 
out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; diff --git a/nntrainer/tensor/float_tensor.h b/nntrainer/tensor/float_tensor.h index 5463e9f1..1adfebb2 100644 --- a/nntrainer/tensor/float_tensor.h +++ b/nntrainer/tensor/float_tensor.h @@ -267,6 +267,13 @@ public: */ int add_i(Tensor const &m, Tensor &output, float const alpha) override; + /** + * @copydoc Tensor::add_i_partial() + */ + int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, const Tensor alphas, + unsigned int alpha_idx) override; + /** * @copydoc Tensor::add(float const &value, Tensor &output) */ diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp index 2f66f1c0..26ac4e85 100644 --- a/nntrainer/tensor/half_tensor.cpp +++ b/nntrainer/tensor/half_tensor.cpp @@ -395,17 +395,8 @@ Tensor &HalfTensor::multiply(Tensor const &m, Tensor &output, const float beta) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - ele_mul(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -495,6 +486,15 @@ int HalfTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { return ML_ERROR_NONE; } +int HalfTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, + (_FP16 
*)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + Tensor &HalfTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); @@ -506,16 +506,8 @@ Tensor &HalfTensor::add(Tensor const &m, Tensor &output, float const alpha) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) { - ele_add(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; @@ -1035,17 +1027,7 @@ Tensor &HalfTensor::divide(float const &value, Tensor &output) const { Tensor &HalfTensor::divide(Tensor const &m, Tensor &output) const { auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides<_FP16>()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); @@ -1136,7 +1118,14 @@ Tensor &HalfTensor::transpose(const std::string &direction, } } else { if (is_format_nchw) { - transposeloop(l, i, k, j, SL, SI, SK, SJ); + for (unsigned int b = 0; b < batch(); ++b) { + for (unsigned int c = 0; c < channel(); ++c) { + transpose_matrix( + height(), width(), (_FP16 *)getData() + getIndex(b, c, 0, 0), + width(), (_FP16 *)output.getData() + output.getIndex(b, c, 0, 0), 
+ output.width()); + } + } } else { transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI); } diff --git a/nntrainer/tensor/half_tensor.h b/nntrainer/tensor/half_tensor.h index 6ca35e4f..43988207 100644 --- a/nntrainer/tensor/half_tensor.h +++ b/nntrainer/tensor/half_tensor.h @@ -267,6 +267,13 @@ public: */ int add_i(Tensor const &m, Tensor &output, float const alpha) override; + /** + * @copydoc Tensor::add_i_partial() + */ + int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, const Tensor alphas, + unsigned int alpha_idx) override; + /** * @copydoc Tensor::add(float const &value, Tensor &output) */ diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 28bc271f..2c815e29 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -251,6 +251,20 @@ Tensor Tensor::multiply(Tensor const &m, const float beta) const { Tensor &Tensor::multiply(Tensor const &m, Tensor &output, const float beta) const { + NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) + << "Tensor Format of " << getName() << ":" + << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. (" + << ((bool)(m.getFormat()) ? 
"NHWC" : "NCHW") << ")"; + + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), + std::invalid_argument) + << getName() << " is not contiguous, cannot multiply"; + + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), + std::invalid_argument) + << getName() << " is not contiguous, cannot multiply"; itensor->multiply(m, output, beta); return output; } @@ -355,19 +369,8 @@ int Tensor::add_i(Tensor const &m, float const alpha) { int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, unsigned int incY, const Tensor alphas, unsigned int alpha_idx) { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, - getAddress(addr_idx), incY); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, - getAddress<_FP16>(addr_idx), incY); -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); - return ML_ERROR_INVALID_PARAMETER; -#endif - } - return ML_ERROR_NONE; + return itensor->add_i_partial(len, addr_idx, m, incX, incY, alphas, + alpha_idx); } Tensor Tensor::add(Tensor const &m, float const alpha) const { diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 789a4eb6..bfd98978 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -23,6 +23,7 @@ #include +#include #include #include @@ -735,19 +736,19 @@ public: */ int add_i(Tensor const &m, float const alpha = 1.F); -/** - * @brief Do add_i for specific section - * - * @param len Length of the specific section - * @param addr_idx Starting index of the psecific section - * @param m Input Tensor to be added - * @param incX Incremental index of X - * @param incY Incremental index of Y - * @param alphas Vector of multiple alpha values - * @param alpha_idx Index of alpha in alpha vector - * @retval 
#ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ + /** + * @brief Do add_i for specific section + * + * @param len Length of the specific section + * + * @param addr_idx Starting index of the specific section + * + * @param m Input Tensor to be added + * + * @param incX Incremental index of X + * + * @param incY Incremental index of Y + * + * @param alphas Vector of multiple alpha values + * + * @param alpha_idx Index of alpha in alpha vector + * + * @retval #ML_ERROR_NONE Successful + * + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + */ int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, unsigned int incY, const Tensor alphas, unsigned int alpha_idx); diff --git a/nntrainer/tensor/tensor_base.h b/nntrainer/tensor/tensor_base.h index 945b82b3..1831de0a 100644 --- a/nntrainer/tensor/tensor_base.h +++ b/nntrainer/tensor/tensor_base.h @@ -283,6 +283,13 @@ public: */ virtual int add_i(Tensor const &m, Tensor &output, float const alpha) = 0; + /** + * @copydoc Tensor::add_i_partial() + */ + virtual int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) = 0; + /** + * @copydoc Tensor::add(float const &value, Tensor &output) */ -- 2.34.1