[Tensor] Quantized Tensor (Int 4) with Scale
author Donghyeon Jeong <dhyeon.jeong@samsung.com>
Tue, 5 Sep 2023 02:25:40 +0000 (11:25 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Tue, 12 Sep 2023 03:01:14 +0000 (12:01 +0900)
- Quantized Tensor is now available as Int 4 with a scale factor.
- Two int4 values are packed into a single int8, each occupying 4 bits (see
  the sketch below).
- Dequantization is performed by multiplying each element by the scale factor
  for a given axis (b, c, h, or w).
- Only read (getValueQint4), write (setValue), and dequantization operations
  are allowed.
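
For illustration, a minimal standalone sketch of the packing arithmetic (as
referenced above), mirroring the encode_qint/decode_qint helpers added in
this commit:

    #include <cassert>
    #include <cstdint>

    // Pack two signed 4-bit values (each in [-8, 7]) into one int8_t.
    int8_t encode_qint(int8_t high, int8_t low) {
      return (high << 4) | (low & 0x0f);
    }

    // Extract one signed 4-bit value; the left shift followed by the
    // arithmetic right shift sign-extends the selected nibble.
    int8_t decode_qint(int8_t val, bool isHigh) {
      return isHigh ? static_cast<int8_t>(val >> 4)
                    : static_cast<int8_t>(static_cast<int8_t>(val << 4) >> 4);
    }

    int main() {
      int8_t packed = encode_qint(-3, 7); // high nibble -3, low nibble 7
      assert(decode_qint(packed, true) == -3);
      assert(decode_qint(packed, false) == 7);
      return 0;
    }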

**Self-evaluation:**
1. Build test:   [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <dhyeon.jeong@samsung.com>
api/ccapi/include/tensor_dim.h
nntrainer/tensor/tensor.cpp
nntrainer/tensor/tensor.h
nntrainer/tensor/tensor_dim.cpp
test/unittest/unittest_nntrainer_tensor.cpp
test/unittest/unittest_nntrainer_tensor_fp16.cpp
test/unittest/unittest_nntrainer_tensor_nhwc.cpp
test/unittest/unittest_nntrainer_tensor_pool.cpp

index 0bf24be..62e21dd 100644 (file)
@@ -48,10 +48,11 @@ public:
   enum class Format { NCHW, NHWC };
 
   /**
-   * @brief Tensor Data Type. Currently QINT8, FP16 & FP32 Support
+   * @brief Tensor Data Type. Currently QINT4, QINT8, FP16 & FP32 are supported
    *
    */
   enum class DataType {
+    QINT4, /** quantized int 4*/
     QINT8, /** quantized int 8*/
     FP16,  /** half precision */
     FP32   /** single precision */
@@ -94,7 +95,7 @@ public:
    * @brief     Creator of TensorDim with Format & DataType
    *
   * @param fm format NCHW | NHWC
-   * @param fm DataType QINT8 | FP16 | FP32
+   * @param d_type DataType QINT4 | QINT8 | FP16 | FP32
    * @param eff_dim_flag_ effective dimension flag (1 means it's effective)
    * @param dyn_dim_flag_ dynamic dimension flag (1 means it's unspecified)
    */
@@ -158,7 +159,7 @@ public:
    * @param h height
    * @param w width
   * @param fm format NCHW | NHWC
-   * @param d_type Data Type QINT8 | FP16 | FP32
+   * @param d_type Data Type QINT4 | QINT8 | FP16 | FP32
    * @param eff_dim_flag_ dimension bit flag to calculate the dynamic
    * dimension, rightmost is width
    */
@@ -187,7 +188,7 @@ public:
    *
    * @param shape shape of format
   * @param fm format NCHW | NHWC
-   * @param d_type data type QINT8 | FP16 | FP32
+   * @param d_type data type QINT4 | QINT8 | FP16 | FP32
    */
   TensorDim(const std::string &shape, TensorDim::Format fm,
             TensorDim::DataType d_type = TensorDim::DataType::FP32);
index 8f3d89d..dd6597e 100644 (file)
@@ -188,6 +188,13 @@ void Tensor::allocate() {
         delete[] mem_data->template getAddr<int8_t>();
         delete mem_data;
       });
+    } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) {
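+      /** two int4 values are packed per byte, so allocate ceil(len / 2) bytes */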
+      mem_data =
+        new MemoryData((void *)(new int8_t[(dim.getDataLen() + 1) / 2]{}));
+      data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
+        delete[] mem_data->template getAddr<int8_t>();
+        delete mem_data;
+      });
     }
     offset = 0;
     initialize();
@@ -245,6 +252,20 @@ bool Tensor::operator==(const Tensor &rhs) const {
           _data[i] != _rdata[i])
         return false;
     }
+  } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    const int8_t *_data = getData<int8_t>();
+    const int8_t *_rdata = rhs.getData<int8_t>();
+    int8_t data, rdata;
+    for (size_t i = 0; i < len; ++i) {
+      /** int4 values can never be NaN, so comparing the decoded values
+       * directly is sufficient */
+      data = decode_qint(_data[i / 2], (i % 2 == 0));
+      rdata = decode_qint(_rdata[i / 2], (i % 2 == 0));
+
+      if (data != rdata)
+        return false;
+    }
   }
 
   return true;
@@ -263,6 +284,8 @@ void Tensor::setRandNormal(float mean, float std) {
 #endif
   } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) {
     throw std::invalid_argument("Error: RandNormal is invalid for QINT8");
+  } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    throw std::invalid_argument("Error: RandNormal is invalid for QINT4");
   }
 }
 
@@ -279,6 +302,8 @@ void Tensor::setRandUniform(float min, float max) {
 #endif
   } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) {
     throw std::invalid_argument("Error: RandUniform is invalid for QINT8");
+  } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    throw std::invalid_argument("Error: RandUniform is invalid for QINT4");
   }
 }
 
@@ -295,6 +320,8 @@ void Tensor::setRandBernoulli(float probability) {
 #endif
   } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) {
     throw std::invalid_argument("Error: setRandBernoulli is invalid for QINT8");
+  } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    throw std::invalid_argument("Error: setRandBernoulli is invalid for QINT4");
   }
 }
 
@@ -2713,6 +2740,51 @@ void Tensor::print(std::ostream &out) const {
       }
       out.copyfmt(init);
     }
+  } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    const int8_t *data = getData<int8_t>();
+    unsigned int len = size();
+    out << "data addr: " << (float *)data << '\n';
+    out << dim;
+
+    if (len > 100) {
+      /** data[] holds two packed values per byte, so decode by element index
+       * instead of reading raw bytes (which would run past the buffer) */
+      out << '[' << (int)getValueQint4(0) << ' ' << (int)getValueQint4(1)
+          << ' ' << (int)getValueQint4(2) << " ... "
+          << (int)getValueQint4(len - 3) << ' ' << (int)getValueQint4(len - 2)
+          << ' ' << (int)getValueQint4(len - 1) << ']' << std::endl;
+      return;
+    }
+
+    std::ios init(NULL);
+    init.copyfmt(out);
+    if (getFormat() == Tformat::NCHW) {
+      for (unsigned int k = 0; k < batch(); k++) {
+        for (unsigned int l = 0; l < channel(); l++) {
+          for (unsigned int i = 0; i < height(); i++) {
+            for (unsigned int j = 0; j < width(); j++) {
+              out << std::setw(10) << (int)this->getValueQint4(k, l, i, j)
+                  << " ";
+            }
+            out << std::endl;
+          }
+          out << std::endl;
+        }
+        out << "-------" << std::endl;
+      }
+    } else {
+      for (unsigned int k = 0; k < batch(); k++) {
+        for (unsigned int i = 0; i < height(); i++) {
+          for (unsigned int j = 0; j < width(); j++) {
+            for (unsigned int l = 0; l < channel(); l++) {
+              out << std::setw(10) << (int)this->getValueQint4(k, l, i, j)
+                  << " ";
+            }
+            out << std::endl;
+          }
+          out << std::endl;
+        }
+        out << "-------" << std::endl;
+      }
+    }
+    out.copyfmt(init);
   }
 }
 
@@ -2827,6 +2899,10 @@ void Tensor::copy(const void *buf) {
     if (buf == getData<int8_t>()) {
       return;
     }
+  } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    if (buf == getData<int8_t>()) {
+      return;
+    }
   }
 
   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
@@ -2841,6 +2917,10 @@ void Tensor::copy(const void *buf) {
     for (unsigned int i = 0; i < size(); ++i) {
       getData<int8_t>()[i] = ((int8_t *)buf)[i];
     }
+  } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    for (unsigned int i = 0; i < (size() + 1) / 2; ++i) {
+      getData<int8_t>()[i] = ((int8_t *)buf)[i];
+    }
   }
 }
 
@@ -3104,6 +3184,10 @@ void Tensor::setValue(float val) {
   } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) {
     int8_t *data = getData<int8_t>();
     std::fill(data, data + size(), val);
+  } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    int8_t *data = getData<int8_t>();
+    int8_t mixed = encode_qint(val, val);
+    std::fill(data, data + (size() + 1) / 2, mixed);
   }
 }
 
@@ -3124,6 +3208,8 @@ void Tensor::setZero() {
 #endif
   } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT8) {
     apply_i<int8_t>([](int8_t val) -> int8_t { return 0; });
+  } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    apply_i<int8_t>([](int8_t val) -> int8_t { return 0; });
   }
 }
 
@@ -3422,16 +3508,44 @@ Tensor Tensor::rotate_180(Tensor in) {
   return output;
 }
 
-void Tensor::setScaleFactors(std::vector<float> scales) {
-  if (!scale_factors.empty()) {
-    throw std::invalid_argument("Error: scale factors already been set");
+int8_t Tensor::encode_qint(int8_t high, int8_t low) const {
+  return (high << 4) | (low & 0x0f);
+}
+
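+/**
+ * @brief Decode one packed int4 value; for the low nibble, the left shift
+ * followed by the arithmetic right shift sign-extends it back to int8_t.
+ */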
+int8_t Tensor::decode_qint(int8_t val, bool isHigh) const {
+  if (isHigh) {
+    val = val >> 4;
+  } else {
+    val = val << 4;
+    val = val >> 4;
+  }
+
+  return val;
+}
+
+void Tensor::setScaleFactors(std::vector<float> scales, int idx) {
+  if (scales.empty() || idx < 0 || idx > 3) {
+    throw std::invalid_argument(
+      "Error: scale factors are empty or axis index is out of range [0, 3]");
   }
 
-  if (scales.size() != channel()) {
+  if (idx == 0 && scales.size() != batch()) {
+    throw std::invalid_argument("Error: scale_factors.size() != batch() ");
+  }
+
+  if (idx == 1 && scales.size() != channel()) {
     throw std::invalid_argument("Error: scale_factors.size() != channel() ");
   }
 
+  if (idx == 2 && scales.size() != height()) {
+    throw std::invalid_argument("Error: scale_factors.size() != height() ");
+  }
+
+  if (idx == 3 && scales.size() != width()) {
+    throw std::invalid_argument("Error: scale_factors.size() != width() ");
+  }
+
   scale_factors = scales;
+  scale_idx = idx;
 }
 
 std::vector<float> Tensor::getScaleFactors() { return scale_factors; }
@@ -3447,7 +3561,8 @@ Tensor Tensor::dequantize(Tensor &output) const {
     throw std::invalid_argument("Error: Tensor cannot be dequantized");
   }
 
-  if (output.getDataType() == Tdatatype::QINT8) {
+  if (output.getDataType() == Tdatatype::QINT8 ||
+      output.getDataType() == Tdatatype::QINT4) {
     throw std::invalid_argument("Error: Target datatype is quantized type");
   }
 
@@ -3462,19 +3577,41 @@ Tensor Tensor::dequantize(Tensor &output) const {
     throw std::invalid_argument("Error: No scale factors");
   }
 
-  for (unsigned int c = 0; c < channel(); ++c) {
-    for (unsigned int b = 0; b < batch(); ++b) {
+  int idx = 0;
+  for (unsigned int b = 0; b < batch(); ++b) {
+    for (unsigned int c = 0; c < channel(); ++c) {
       for (unsigned int h = 0; h < height(); ++h) {
         for (unsigned int w = 0; w < width(); ++w) {
+          if (scale_idx == 0)
+            idx = b;
+          else if (scale_idx == 1)
+            idx = c;
+          else if (scale_idx == 2)
+            idx = h;
+          else if (scale_idx == 3)
+            idx = w;
+
           if (output.getDataType() == Tdatatype::FP32) {
-            output.setValue(b, c, h, w,
-                            (float)getValue<int8_t>(b, c, h, w) *
-                              scale_factors[c]);
+            if (getDataType() == Tdatatype::QINT8) {
+              output.setValue(b, c, h, w,
+                              (float)getValue<int8_t>(b, c, h, w) *
+                                scale_factors[idx]);
+            } else {
+              output.setValue(b, c, h, w,
+                              (float)getValueQint4(b, c, h, w) *
+                                scale_factors[idx]);
+            }
           } else if (output.getDataType() == Tdatatype::FP16) {
 #ifdef ENABLE_FP16
-            output.setValue(b, c, h, w,
-                            (_FP16)getValue<int8_t>(b, c, h, w) *
-                              (_FP16)scale_factors[c]);
+            if (getDataType() == Tdatatype::QINT8) {
+              output.setValue(b, c, h, w,
+                              (_FP16)getValue<int8_t>(b, c, h, w) *
+                                (_FP16)scale_factors[idx]);
+            } else {
+              output.setValue(b, c, h, w,
+                              (_FP16)getValueQint4(b, c, h, w) *
+                                (_FP16)scale_factors[idx]);
+            }
 #else
             throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
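
A brief usage sketch of the per-axis scale factor API above (dimensions and
scale values are illustrative):

    // QINT4 tensor in NCHW; setValue(2) writes 2 into every 4-bit element
    nntrainer::Tensor q(
      1, 3, 4, 5, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4});
    q.setValue(2);

    // axis index passed to setScaleFactors:
    // 0 = batch, 1 = channel, 2 = height, 3 = width
    q.setScaleFactors({0.5f, 1.0f, 1.5f}, 1); // one factor per channel

    // output(b, c, h, w) = q(b, c, h, w) * scale_factors[c]
    nntrainer::Tensor output(1, 3, 4, 5); // FP32 by default
    q.dequantize(output);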
index 8acb451..9911417 100644 (file)
@@ -358,7 +358,8 @@ public:
         "[Tensor] trying to initialize Tensor from empty vector");
     }
 
-    if (t_type.data_type != Tdatatype::QINT8) {
+    if (t_type.data_type != Tdatatype::QINT8 &&
+        t_type.data_type != Tdatatype::QINT4) {
       throw std::out_of_range(
         "[Tensor] TensorType do not match with input data type");
     }
@@ -382,7 +383,9 @@ public:
     strides = dim.computeStrides();
 
     MemoryData *mem_data =
-      new MemoryData((void *)(new int8_t[dim.getDataLen()]()));
+      (t_type.data_type == Tdatatype::QINT8)
+        ? new MemoryData((void *)(new int8_t[dim.getDataLen()]()))
+        : new MemoryData((void *)(new int8_t[(dim.getDataLen() + 1) / 2]()));
     data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
       delete[] mem_data->getAddr<int8_t>();
     });
@@ -550,6 +553,9 @@ public:
    */
   template <typename T = float>
   const T &getValue(unsigned int idx) const noexcept {
+    if (getDataType() == Tdatatype::QINT4) {
+      return getData<T>()[idx / 2];
+    }
     return getData<T>()[idx];
   }
 
@@ -558,10 +564,63 @@ public:
    * @param[in] idx location
    */
   template <typename T = float> T &getValue(unsigned int idx) noexcept {
+    if (getDataType() == Tdatatype::QINT4) {
+      return getData<T>()[idx / 2];
+    }
     return getData<T>()[idx];
   }
 
   /**
+   * @brief     return value at specific location
+   * @param[in] idx location
+   * @retval    qint4 value in location
+   */
+  int8_t getValueQint4(unsigned int idx) const noexcept {
+    int8_t value = getData<int8_t>()[idx / 2];
+    return decode_qint(value, (idx % 2 == 0));
+  }
+
+  /**
+   * @brief     return value at specific location
+   * @param[in] idx location
+   * @retval    qint4 value in location
+   */
+  int8_t getValueQint4(unsigned int idx) noexcept {
+    int8_t value = getData<int8_t>()[idx / 2];
+    return decode_qint(value, (idx % 2 == 0));
+  }
+
+  /**
+   * @brief     return value at specific location
+   * @param[in] b batch location
+   * @param[in] c channel location
+   * @param[in] h height location
+   * @param[in] w width location
+   * @retval    qint4 value in location
+   */
+  int8_t getValueQint4(unsigned int b, unsigned int c, unsigned int h,
+                       unsigned int w) const noexcept {
+    size_t idx = getIndex(b, c, h, w);
+    int8_t value = getData<int8_t>()[idx / 2];
+    return decode_qint(value, (idx % 2 == 0));
+  }
+
+  /**
+   * @brief     return value at specific location
+   * @param[in] b batch location
+   * @param[in] c channel location
+   * @param[in] h height location
+   * @param[in] w width location
+   * @retval    qint4 value in location
+   */
+  int8_t getValueQint4(unsigned int b, unsigned int c, unsigned int h,
+                       unsigned int w) noexcept {
+    size_t idx = getIndex(b, c, h, w);
+    int8_t value = getData<int8_t>()[idx / 2];
+    return decode_qint(value, (idx % 2 == 0));
+  }
+
+  /**
    * @brief Get the Value thinking that it is padded
    * for example, for the tensor (virtually padded) below,
    * getValue(0, 0, 2, 2, 1, 1, .0f) will return 5
@@ -1198,6 +1257,8 @@ public:
     if (i > index) {
       return nullptr;
     }
+    if (getDataType() == Tdatatype::QINT4)
+      return &getData<T>()[i / 2];
     return &getData<T>()[i];
   }
 
@@ -1211,6 +1272,8 @@ public:
       return nullptr;
     }
 
+    if (getDataType() == Tdatatype::QINT4)
+      return &getData<T>()[i / 2];
     return &getData<T>()[i];
   }
 
@@ -1352,7 +1415,13 @@ public:
    * @brief     Get size of the data in bytes
    * @retval    size_t Size in bytes
    */
-  size_t bytes() const { return size() * dim.getDataTypeSize(); }
+  size_t bytes() const {
+    if (getDataType() == Tdatatype::QINT4) {
+      /** two int4 values are packed per byte; odd element counts round up */
+      return (size() + 1) / 2 * dim.getDataTypeSize();
+    }
+    return size() * dim.getDataTypeSize();
+  }
 
   /**
    * @brief     Set the element value
@@ -1374,6 +1443,16 @@ public:
 #endif
     } else if (getDataType() == Tdatatype::QINT8) {
       getData<int8_t>()[getIndex(batch, c, h, w)] = value;
+    } else if (getDataType() == Tdatatype::QINT4) {
+      int idx = getIndex(batch, c, h, w);
+
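+      /** values at even flat indices occupy the high nibble; odd, the low */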
+      if (idx % 2 == 0) {
+        getData<int8_t>()[idx / 2] =
+          encode_qint(value, getData<int8_t>()[idx / 2]);
+      } else {
+        getData<int8_t>()[idx / 2] =
+          encode_qint(getData<int8_t>()[idx / 2] >> 4, value);
+      }
     }
   }
 
@@ -1878,7 +1957,7 @@ public:
    * @brief     Set scale factors of the tensor
    * @param[in] scales scale factors
    */
-  void setScaleFactors(std::vector<float> scales);
+  void setScaleFactors(std::vector<float> scales, int idx);
 
   /**
    * @brief     Get scale factors of the tensor
@@ -1911,6 +1990,7 @@ private:
   std::string name; /**< name of the tensor */
   std::shared_ptr<MemoryData> data;
   size_t offset;
+  int scale_idx;
   std::vector<float> scale_factors;
 
   /**<
@@ -2047,6 +2127,21 @@ private:
    */
   Tensor rotate_180(Tensor in);
 
+  /**
+   * @brief      Encode two int4 values to one int8 value
+   * @param[in]  high value for first 4 bits
+   * @param[in]  low value for last 4 bits
+   * @retval     Encoded value
+   */
+  int8_t encode_qint(int8_t high, int8_t low) const;
+
+  /**
+   * @brief      Decode an int8 value into one of its int4 values
+   * @param[in]  val value to decode
+   * @param[in]  isHigh true to decode the first (high) 4 bits, false for the
+   * last (low) 4 bits
+   * @retval     Decoded value
+   */
+  int8_t decode_qint(int8_t val, bool isHigh) const;
+
 }; // namespace nntrainer
 
 /**
index 5bb65de..d70c033 100644 (file)
@@ -125,6 +125,8 @@ uint TensorDim::getDataTypeSize() const {
     return sizeof(float);
   case TensorDim::DataType::QINT8:
     return sizeof(int8_t);
+  case TensorDim::DataType::QINT4:
+    return sizeof(int8_t);
   default:
     return sizeof(float);
   }
@@ -344,6 +346,8 @@ std::ostream &operator<<(std::ostream &out, TensorDim const &d) {
     type_ = "FP16";
   } else if (d.getDataType() == ml::train::TensorDim::DataType::QINT8) {
     type_ = "QINT8";
+  } else if (d.getDataType() == ml::train::TensorDim::DataType::QINT4) {
+    type_ = "QINT4";
   }
 
   std::string format_ =
index 0f4731b..71783a3 100644 (file)
@@ -216,6 +216,49 @@ TEST(nntrainer_Tensor, Tensor_04_p) {
   EXPECT_EQ(status, ML_ERROR_NONE);
 }
 
+TEST(nntrainer_Tensor, Tensor_05_p) {
+  int status = ML_ERROR_NONE;
+  std::vector<std::vector<std::vector<int8_t>>> in = {{{-8, -7}, {-6, -5}},
+                                                      {{-4, -3}, {-2, -1}},
+                                                      {{0, 1}, {2, 3}},
+                                                      {{4, 5}, {6, 7}}};
+
+  nntrainer::Tensor tensor = nntrainer::Tensor(
+    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4});
+  ASSERT_NE(nullptr, tensor.getData<int8_t>());
+
+  for (size_t b = 0; b < tensor.batch(); ++b) {
+    for (size_t c = 0; c < tensor.channel(); ++c) {
+      for (size_t h = 0; h < tensor.height(); ++h) {
+        for (size_t w = 0; w < tensor.width(); ++w) {
+          size_t idx = tensor.getIndex(b, c, h, w);
+          ASSERT_EQ(idx - 8, tensor.getValueQint4(idx));
+        }
+      }
+    }
+  }
+}
+
+TEST(nntrainer_Tensor, Tensor_06_p) {
+  int status = ML_ERROR_NONE;
+  nntrainer::Tensor tensor = nntrainer::Tensor(
+    1, 4, 2, 2, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4});
+  ASSERT_NE(nullptr, tensor.getData<int8_t>());
+
+  tensor.setValue(-2);
+
+  for (size_t b = 0; b < tensor.batch(); ++b) {
+    for (size_t c = 0; c < tensor.channel(); ++c) {
+      for (size_t h = 0; h < tensor.height(); ++h) {
+        for (size_t w = 0; w < tensor.width(); ++w) {
+          size_t idx = tensor.getIndex(b, c, h, w);
+          ASSERT_EQ(-2, tensor.getValueQint4(idx));
+        }
+      }
+    }
+  }
+}
+
 TEST(nntrainer_Tensor, multiply_i_01_p) {
   int status = ML_ERROR_NONE;
   int batch = 3;
@@ -3288,6 +3331,26 @@ TEST(nntrainer_Tensor, allocate_03_p) {
   EXPECT_TRUE(t.isAllocated());
 }
 
+TEST(nntrainer_Tensor, allocate_04_p) {
+  nntrainer::Tensor t(
+    {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}},
+    true);
+  EXPECT_TRUE(t.isAllocated());
+
+  t.allocate();
+  EXPECT_TRUE(t.isAllocated());
+}
+
+TEST(nntrainer_Tensor, allocate_05_p) {
+  nntrainer::Tensor t(
+    {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}},
+    true);
+  EXPECT_TRUE(t.isAllocated());
+
+  t.allocate();
+  EXPECT_TRUE(t.isAllocated());
+}
+
 TEST(nntrainer_Tensor, initialize_01_p) {
   nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Tensor::Initializer::ONES);
 
@@ -3395,6 +3458,20 @@ TEST(nntrainer_Tensor, initialize_08_p) {
   EXPECT_EQ(golden, t);
 }
 
+TEST(nntrainer_Tensor, initialize_09_p) {
+  nntrainer::Tensor t(
+    {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, true,
+    nntrainer::Tensor::Initializer::ONES);
+  nntrainer::Tensor golden(
+    {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, true,
+    nntrainer::Tensor::Initializer::ZEROS);
+
+  EXPECT_NE(golden, t);
+
+  golden.initialize(nntrainer::Tensor::Initializer::ONES);
+  EXPECT_EQ(golden, t);
+}
+
 TEST(nntrainer_Tensor, split_01_p) {
   {
     nntrainer::TensorDim ref_dim(3, 2, 4, 5);
@@ -4277,7 +4354,7 @@ TEST(nntrainer_Tensor, dequantize_01_n) {
 
   nntrainer::Tensor input(batch, channel, height, width);
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(batch, channel, height, width);
 
@@ -4297,7 +4374,7 @@ TEST(nntrainer_Tensor, dequantize_02_n) {
     batch + 1, channel, height + 1, width + 1,
     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(batch, channel, height, width);
 
@@ -4338,9 +4415,11 @@ TEST(nntrainer_Tensor, dequantize_04_n) {
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
   EXPECT_THROW(
     {
-      input.setScaleFactors({2.0, 1.5, 1.0, 0.5});
+      input.setScaleFactors({2.0, 1.5, 1.0, 0.5}, 1);
     },
     std::invalid_argument);
+
+  EXPECT_NO_THROW({ input.setScaleFactors({2.0, 1.5, 1.0, 0.5}, 2); });
 }
 
 /**
@@ -4356,7 +4435,7 @@ TEST(nntrainer_Tensor, dequantize_05_n) {
     batch, channel, height, width,
     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(
     batch, channel, height, width,
@@ -4378,7 +4457,7 @@ TEST(nntrainer_Tensor, dequantize_06_p) {
     batch, channel, height, width,
     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output;
 
@@ -4402,33 +4481,140 @@ TEST(nntrainer_Tensor, dequantize_06_p) {
  * @brief dequantize tensor
  */
 TEST(nntrainer_Tensor, dequantize_07_p) {
-  int batch = 1;
-  int channel = 3;
-  int height = 4;
-  int width = 5;
+  size_t batch = 1;
+  size_t channel = 3;
+  size_t height = 4;
+  size_t width = 5;
 
   nntrainer::Tensor input(
-    batch, channel, height, width,
-    {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
-  GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+    {batch,
+     channel,
+     height,
+     width,
+     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}},
+    true, nntrainer::Tensor::Initializer::ONES);
+  nntrainer::Tensor output(batch, channel, height, width);
+
+  // Dequantize by channel
+  EXPECT_NO_THROW(input.setScaleFactors({-2, 2, 4}, 1));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  float answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                           -2, -2, -2, -2, -2, -2, -2, -2, 2,  2,  2,  2,
+                           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+                           2,  2,  2,  2,  4,  4,  4,  4,  4,  4,  4,  4,
+                           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4};
+
+  nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_1);
+
+  EXPECT_EQ(output, answer1);
+
+  // Dequantize by height
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4.8}, 2));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  float answer_data_2[] = {
+    -4.2, -4.2, -4.2, -4.2, -4.2, -2,   -2,   -2,   -2,   -2,   2,    2,
+    2,    2,    2,    4.8,  4.8,  4.8,  4.8,  4.8,  -4.2, -4.2, -4.2, -4.2,
+    -4.2, -2,   -2,   -2,   -2,   -2,   2,    2,    2,    2,    2,    4.8,
+    4.8,  4.8,  4.8,  4.8,  -4.2, -4.2, -4.2, -4.2, -4.2, -2,   -2,   -2,
+    -2,   -2,   2,    2,    2,    2,    2,    4.8,  4.8,  4.8,  4.8,  4.8};
+  nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_2);
+
+  EXPECT_EQ(output, answer2);
+
+  // Dequantize by width
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4, -8}, 3));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  float answer_data_3[] = {
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8};
+
+  nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_3);
+
+  EXPECT_EQ(output, answer3);
+}
 
+/**
+ * @brief dequantize qint4 tensor
+ */
+TEST(nntrainer_Tensor, dequantize_08_p) {
+  size_t batch = 1;
+  size_t channel = 3;
+  size_t height = 4;
+  size_t width = 5;
+
+  nntrainer::Tensor input(
+    {batch,
+     channel,
+     height,
+     width,
+     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}},
+    true, nntrainer::Tensor::Initializer::ONES);
   nntrainer::Tensor output(batch, channel, height, width);
 
+  // Dequantize by channel
+  EXPECT_NO_THROW(input.setScaleFactors({-2, 2, 4}, 1));
   EXPECT_NO_THROW({ input.dequantize(output); });
 
-  float answer_data[] = {
-    1.5, 1.5, 1.5, 1.5, 1.5, 3,   3,   3,   3,   3,   4.5, 4.5, 4.5, 4.5, 4.5,
-    6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,   7,   7,   7,   7,
-    8,   8,   8,   8,   8,   9,   9,   9,   9,   9,   5.5, 5.5, 5.5, 5.5, 5.5,
-    6,   6,   6,   6,   6,   6.5, 6.5, 6.5, 6.5, 6.5, 7,   7,   7,   7,   7};
+  float answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                           -2, -2, -2, -2, -2, -2, -2, -2, 2,  2,  2,  2,
+                           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+                           2,  2,  2,  2,  4,  4,  4,  4,  4,  4,  4,  4,
+                           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4};
 
-  nntrainer::Tensor answer(ml::train::TensorDim(batch, channel, height, width,
-                                                {nntrainer::Tformat::NCHW,
-                                                 nntrainer::Tdatatype::FP32}),
-                           answer_data);
+  nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_1);
 
-  EXPECT_EQ(output, answer);
+  EXPECT_EQ(output, answer1);
+
+  // Dequantize by height
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4}, 2));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  float answer_data_2[] = {-4.2, -4.2, -4.2, -4.2, -4.2, -2, -2, -2, -2, -2,
+                           2,    2,    2,    2,    2,    4,  4,  4,  4,  4,
+                           -4.2, -4.2, -4.2, -4.2, -4.2, -2, -2, -2, -2, -2,
+                           2,    2,    2,    2,    2,    4,  4,  4,  4,  4,
+                           -4.2, -4.2, -4.2, -4.2, -4.2, -2, -2, -2, -2, -2,
+                           2,    2,    2,    2,    2,    4,  4,  4,  4,  4};
+  nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_2);
+
+  EXPECT_EQ(output, answer2);
+
+  // Dequantize by width
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4, -8}, 3));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  float answer_data_3[] = {
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8,
+    -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8, -4.2, -2, 2, 4, -8};
+
+  nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP32}),
+                            answer_data_3);
+
+  EXPECT_EQ(output, answer3);
 }
 
 int main(int argc, char **argv) {
index eda4b63..67dc946 100644 (file)
@@ -5847,6 +5847,361 @@ TEST(nntrainer_Tensor, TensorPaddedValue_p) {
   }
 }
 
+/**
+ * @brief dequantize FP16 tensor
+ */
+TEST(nntrainer_Tensor, dequantize_01_n) {
+  int batch = 1;
+  int channel = 3;
+  int height = 4;
+  int width = 5;
+
+  nntrainer::Tensor input(batch, channel, height, width,
+                          nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16);
+  GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
+
+  nntrainer::Tensor output(batch, channel, height, width,
+                           nntrainer::Tformat::NCHW,
+                           nntrainer::Tdatatype::FP16);
+
+  EXPECT_THROW({ input.dequantize(output); }, std::invalid_argument);
+}
+
+/**
+ * @brief dequantize tensor with different dimension
+ */
+TEST(nntrainer_Tensor, dequantize_02_n) {
+  int batch = 1;
+  int channel = 3;
+  int height = 4;
+  int width = 5;
+
+  nntrainer::Tensor input(
+    batch + 1, channel, height + 1, width + 1,
+    {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
+  GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
+
+  nntrainer::Tensor output(batch, channel, height, width,
+                           nntrainer::Tformat::NCHW,
+                           nntrainer::Tdatatype::FP16);
+
+  EXPECT_THROW({ input.dequantize(output); }, std::invalid_argument);
+}
+
+/**
+ * @brief dequantize tensor with no scale factors
+ */
+TEST(nntrainer_Tensor, dequantize_03_n) {
+  int batch = 1;
+  int channel = 3;
+  int height = 4;
+  int width = 5;
+
+  nntrainer::Tensor input(
+    batch, channel, height, width,
+    {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
+  GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
+
+  nntrainer::Tensor output(batch, channel, height, width,
+                           nntrainer::Tformat::NCHW,
+                           nntrainer::Tdatatype::FP16);
+
+  EXPECT_THROW({ input.dequantize(output); }, std::invalid_argument);
+}
+
+/**
+ * @brief dequantize qint8 tensor to fp16
+ */
+TEST(nntrainer_Tensor, dequantize_04_p) {
+  int batch = 1;
+  int channel = 3;
+  int height = 4;
+  int width = 5;
+
+  nntrainer::Tensor input(
+    batch, channel, height, width,
+    {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
+  GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1);
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
+
+  nntrainer::Tensor output;
+
+  EXPECT_NO_THROW({ output = input.dequantize(nntrainer::Tdatatype::FP16); });
+
+  _FP16 answer_data[] = {
+    static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), static_cast<_FP16>(1.5),
+    static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), static_cast<_FP16>(3),
+    static_cast<_FP16>(3),   static_cast<_FP16>(3),   static_cast<_FP16>(3),
+    static_cast<_FP16>(3),   static_cast<_FP16>(4.5), static_cast<_FP16>(4.5),
+    static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), static_cast<_FP16>(4.5),
+    static_cast<_FP16>(6),   static_cast<_FP16>(6),   static_cast<_FP16>(6),
+    static_cast<_FP16>(6),   static_cast<_FP16>(6),   static_cast<_FP16>(6),
+    static_cast<_FP16>(6),   static_cast<_FP16>(6),   static_cast<_FP16>(6),
+    static_cast<_FP16>(6),   static_cast<_FP16>(7),   static_cast<_FP16>(7),
+    static_cast<_FP16>(7),   static_cast<_FP16>(7),   static_cast<_FP16>(7),
+    static_cast<_FP16>(8),   static_cast<_FP16>(8),   static_cast<_FP16>(8),
+    static_cast<_FP16>(8),   static_cast<_FP16>(8),   static_cast<_FP16>(9),
+    static_cast<_FP16>(9),   static_cast<_FP16>(9),   static_cast<_FP16>(9),
+    static_cast<_FP16>(9),   static_cast<_FP16>(5.5), static_cast<_FP16>(5.5),
+    static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), static_cast<_FP16>(5.5),
+    static_cast<_FP16>(6),   static_cast<_FP16>(6),   static_cast<_FP16>(6),
+    static_cast<_FP16>(6),   static_cast<_FP16>(6),   static_cast<_FP16>(6.5),
+    static_cast<_FP16>(6.5), static_cast<_FP16>(6.5), static_cast<_FP16>(6.5),
+    static_cast<_FP16>(6.5), static_cast<_FP16>(7),   static_cast<_FP16>(7),
+    static_cast<_FP16>(7),   static_cast<_FP16>(7),   static_cast<_FP16>(7)};
+
+  nntrainer::Tensor answer(ml::train::TensorDim(batch, channel, height, width,
+                                                {nntrainer::Tformat::NCHW,
+                                                 nntrainer::Tdatatype::FP16}),
+                           answer_data);
+
+  EXPECT_EQ(output, answer);
+}
+
+/**
+ * @brief dequantize qint8 tensor to fp16
+ */
+TEST(nntrainer_Tensor, dequantize_05_p) {
+  size_t batch = 1;
+  size_t channel = 3;
+  size_t height = 4;
+  size_t width = 5;
+
+  nntrainer::Tensor input(
+    {batch,
+     channel,
+     height,
+     width,
+     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}},
+    true, nntrainer::Tensor::Initializer::ONES);
+  nntrainer::Tensor output(batch, channel, height, width,
+                           nntrainer::Tformat::NCHW,
+                           nntrainer::Tdatatype::FP16);
+
+  // Dequantize by channel
+  EXPECT_NO_THROW(input.setScaleFactors({-2, 2, 4}, 1));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                           -2, -2, -2, -2, -2, -2, -2, -2, 2,  2,  2,  2,
+                           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+                           2,  2,  2,  2,  4,  4,  4,  4,  4,  4,  4,  4,
+                           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4};
+
+  nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_1);
+
+  EXPECT_EQ(output, answer1);
+
+  // Dequantize by height
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4.8}, 2));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8),
+                           static_cast<_FP16>(4.8),  static_cast<_FP16>(4.8)};
+  nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_2);
+
+  EXPECT_EQ(output, answer2);
+
+  // Dequantize by width
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4, -8}, 3));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8)};
+
+  nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_3);
+
+  EXPECT_EQ(output, answer3);
+}
+
+/**
+ * @brief dequantize qint4 tensor
+ */
+TEST(nntrainer_Tensor, dequantize_06_p) {
+  size_t batch = 1;
+  size_t channel = 3;
+  size_t height = 4;
+  size_t width = 5;
+
+  nntrainer::Tensor input(
+    {batch,
+     channel,
+     height,
+     width,
+     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}},
+    true, nntrainer::Tensor::Initializer::ONES);
+  nntrainer::Tensor output(batch, channel, height, width,
+                           nntrainer::Tformat::NCHW,
+                           nntrainer::Tdatatype::FP16);
+
+  // Dequantize by channel
+  EXPECT_NO_THROW(input.setScaleFactors({-2, 2, 4}, 1));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+                           -2, -2, -2, -2, -2, -2, -2, -2, 2,  2,  2,  2,
+                           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+                           2,  2,  2,  2,  4,  4,  4,  4,  4,  4,  4,  4,
+                           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4};
+
+  nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_1);
+
+  EXPECT_EQ(output, answer1);
+
+  // Dequantize by height
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4}, 2));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(4)};
+  nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_2);
+
+  EXPECT_EQ(output, answer2);
+
+  // Dequantize by width
+  EXPECT_NO_THROW(input.setScaleFactors({-4.2, -2, 2, 4, -8}, 3));
+  EXPECT_NO_THROW({ input.dequantize(output); });
+
+  _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8),
+                           static_cast<_FP16>(-4.2), static_cast<_FP16>(-2),
+                           static_cast<_FP16>(2),    static_cast<_FP16>(4),
+                           static_cast<_FP16>(-8),   static_cast<_FP16>(-4.2),
+                           static_cast<_FP16>(-2),   static_cast<_FP16>(2),
+                           static_cast<_FP16>(4),    static_cast<_FP16>(-8)};
+
+  nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width,
+                                                 {nntrainer::Tformat::NCHW,
+                                                  nntrainer::Tdatatype::FP16}),
+                            answer_data_3);
+
+  EXPECT_EQ(output, answer3);
+}
+
 GTEST_API_ int main(int argc, char **argv) {
   int result = -1;
 
index 37bcd92..e05ba06 100644 (file)
@@ -4688,7 +4688,7 @@ TEST(nntrainer_Tensor, dequantize_01_n) {
     batch, channel, height, width,
     {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(
     batch, channel, height, width,
@@ -4710,7 +4710,7 @@ TEST(nntrainer_Tensor, dequantize_02_n) {
     batch, channel, height, width,
     {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(
     batch, channel, height, width,
@@ -4732,7 +4732,7 @@ TEST(nntrainer_Tensor, dequantize_03_p) {
     batch, channel, height, width,
     {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output;
   output.getDim().setFormat(nntrainer::Tformat::NHWC);
@@ -4766,7 +4766,7 @@ TEST(nntrainer_Tensor, dequantize_04_p) {
     batch, channel, height, width,
     {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT8});
   GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1);
-  input.setScaleFactors({1.5, 1.0, 0.5});
+  input.setScaleFactors({1.5, 1.0, 0.5}, 1);
 
   nntrainer::Tensor output(
     batch, channel, height, width,
@@ -4788,6 +4788,38 @@ TEST(nntrainer_Tensor, dequantize_04_p) {
   EXPECT_EQ(output, answer);
 }
 
+/**
+ * @brief dequantize nhwc qint4 tensor
+ */
+TEST(nntrainer_Tensor, dequantize_05_p) {
+  size_t batch = 1;
+  size_t channel = 10;
+  size_t height = 2;
+  size_t width = 1;
+
+  nntrainer::Tensor input(
+    {batch,
+     channel,
+     height,
+     width,
+     {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT4}},
+    true, nntrainer::Tensor::Initializer::ONES);
+  input.setScaleFactors({-8, -6, -4, -2, -1, 1, 2, 4, 6, 7}, 1);
+  nntrainer::Tensor output;
+
+  EXPECT_NO_THROW({ output = input.dequantize(nntrainer::Tdatatype::FP32); });
+
+  float answer_data[] = {-8, -6, -4, -2, -1, 1, 2, 4, 6, 7,
+                         -8, -6, -4, -2, -1, 1, 2, 4, 6, 7};
+
+  nntrainer::Tensor answer(ml::train::TensorDim(batch, channel, height, width,
+                                                {nntrainer::Tformat::NHWC,
+                                                 nntrainer::Tdatatype::FP32}),
+                           answer_data);
+
+  EXPECT_EQ(output, answer);
+}
+
 int main(int argc, char **argv) {
   int result = -1;
 
index c898818..ee75538 100644 (file)
@@ -438,7 +438,7 @@ TEST(TensorPool, validate_memory) {
 /**
  * @brief qint8 tensors reuse fp32 tensor memory space
  */
-TEST(TensorPool, validate_memory_reuse_p) {
+TEST(TensorPool, validate_memory_reuse_01_p) {
   // |--------- t1 ---------|
   // |-t2-||-t3-||-t4-||-t5-|
   nntrainer::TensorPool pool;
@@ -497,6 +497,67 @@ TEST(TensorPool, validate_memory_reuse_p) {
 }
 
 /**
+ * @brief qint4 tensors reuse fp32 tensor memory space
+ */
+TEST(TensorPool, validate_memory_reuse_02_p) {
+  // |--------- t1 ---------|
+  // |-t2-||-t3-||-t4-||-t5-|
+  nntrainer::TensorPool pool;
+  nntrainer::Tensor *t1 = nullptr, *t2 = nullptr, *t3 = nullptr, *t4 = nullptr,
+                    *t5 = nullptr;
+
+  EXPECT_NO_THROW(
+    t1 = pool.request("t1", nntrainer::TensorDim({4}), {0},
+                      nntrainer::TensorLifespan::FORWARD_FUNC_LIFESPAN));
+  EXPECT_NE(t1, nullptr);
+  EXPECT_FALSE(t1->isAllocated());
+
+  EXPECT_NO_THROW(
+    t2 = pool.request("t2",
+                      nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW,
+                                                 nntrainer::Tdatatype::QINT4}),
+                      {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN));
+  EXPECT_NE(t2, nullptr);
+  EXPECT_FALSE(t2->isAllocated());
+
+  EXPECT_NO_THROW(
+    t3 = pool.request("t3",
+                      nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW,
+                                                 nntrainer::Tdatatype::QINT4}),
+                      {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN));
+  EXPECT_NE(t3, nullptr);
+  EXPECT_FALSE(t3->isAllocated());
+
+  EXPECT_NO_THROW(
+    t4 = pool.request("t4",
+                      nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW,
+                                                 nntrainer::Tdatatype::QINT4}),
+                      {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN));
+  EXPECT_NE(t4, nullptr);
+  EXPECT_FALSE(t4->isAllocated());
+
+  EXPECT_NO_THROW(
+    t5 = pool.request("t5",
+                      nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW,
+                                                 nntrainer::Tdatatype::QINT4}),
+                      {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN));
+  EXPECT_NE(t5, nullptr);
+  EXPECT_FALSE(t5->isAllocated());
+
+  EXPECT_NO_THROW(pool.finalize(nntrainer::OptimizedV1Planner(), 0, 2));
+  EXPECT_EQ(pool.minMemoryRequirement(), t1->bytes());
+
+  EXPECT_NO_THROW(pool.allocate());
+
+  EXPECT_EQ(t1->getAddress<float>(0), (float *)t2->getAddress<int8_t>(0));
+  EXPECT_EQ(t1->getAddress<float>(1), (float *)t3->getAddress<int8_t>(0));
+  EXPECT_EQ(t1->getAddress<float>(2), (float *)t4->getAddress<int8_t>(0));
+  EXPECT_EQ(t1->getAddress<float>(3), (float *)t5->getAddress<int8_t>(0));
+
+  EXPECT_NO_THROW(pool.deallocate());
+}
+
+/**
  * @brief check if data span of two tensor testOverlap
  *
  * @param t1 tensor1