#define sgemv_loop_fp16(ci, cj, cM, cN) \
do { \
- _Float16 y0; \
+ _FP16 y0; \
unsigned int i, j; \
for (ci = 0; ci != cM; ci++) { \
- y0 = Y[ci * incy] * static_cast<_Float16>(beta); \
+ y0 = Y[ci * incy] * static_cast<_FP16>(beta); \
for (cj = 0; cj != cN; cj++) \
y0 += A[i + j * lda] * X[cj * incx]; \
Y[ci * incy] = y0; \
namespace nntrainer {
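// _FP16 is assumed to be a project-wide alias for the compiler's native
// half-precision type (e.g. _Float16 or __fp16), defined once in a shared
// header when ENABLE_FP16 is set, so the kernels below no longer name the
// compiler type directly. A minimal, hypothetical sketch of such an alias:
//
//   #ifdef ENABLE_FP16
//   #define _FP16 _Float16 /* or __fp16, depending on the toolchain */
//   #endif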
#ifdef ENABLE_FP16
-static void saxpy_FP16(const unsigned int N, const float alpha, const _Float16 *X,
- const int incX, _Float16 *Y, const int incY) {
+static void saxpy_FP16(const unsigned int N, const float alpha, const _FP16 *X,
+ const int incX, _FP16 *Y, const int incY) {
if (incX < 0 or incY < 0)
throw std::invalid_argument(
"Error: negative inc not supported without cblas");
for (unsigned int i = 0; i < N; ++i)
- Y[i * incY] = Y[i * incY] + static_cast<_Float16>(alpha) * X[i * incX];
+ Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX];
}
static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
const unsigned int M, const unsigned int N,
- const float alpha, const _Float16 *A,
- const unsigned int lda, const _Float16 *X, const int incX,
- const float beta, _Float16 *Y, const int incY) {
+ const float alpha, const _FP16 *A,
+ const unsigned int lda, const _FP16 *X, const int incX,
+ const float beta, _FP16 *Y, const int incY) {
unsigned int incy = abs(incY);
unsigned int incx = abs(incX);
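// Remaining body elided here; the transpose cases are assumed to dispatch to
// the sgemv_loop_fp16 macro above, e.g. (hypothetical sketch):
//   if (TransA == CblasTrans)
//     sgemv_loop_fp16(i, j, N, M);
//   else
//     sgemv_loop_fp16(j, i, M, N);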
}
}
-static _Float16 sdot_FP16(const unsigned int N, const _Float16 *X,
- const unsigned int incX, const _Float16 *Y,
+static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
+ const unsigned int incX, const _FP16 *Y,
const unsigned int incY) {
- _Float16 ret = 0;
+ _FP16 ret = 0;
for (unsigned int i = 0; i < N; ++i) {
ret += X[i * incX] * Y[i * incY];
}
return ret;
}
-static void scopy_FP16(const unsigned int N, const _Float16 *X, const int incX,
- _Float16 *Y, const int incY) {
+static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX,
+ _FP16 *Y, const int incY) {
unsigned int incy = abs(incY);
unsigned int incx = abs(incX);
Y[i * incy] = X[i * incx];
}
-void sscal(const unsigned int N, const float alpha, _Float16 *X, const int incX) {
+void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
unsigned int incx = abs(incX);
for (unsigned int i = 0; i < N; ++i)
- X[i * incx] = static_cast<_Float16>(alpha) * X[i * incx];
+ X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx];
}
-static _Float16 snrm2_FP16(const unsigned int N, const _Float16 *X, const int incX) {
+static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) {
unsigned int incx = abs(incX);
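// Euclidean (L2) norm: accumulate the sum of squares in _FP16, then take the
// square root; the OpenMP reduction below parallelizes the accumulation.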
- _Float16 sum = 0;
- _Float16 tmp;
+ _FP16 sum = 0;
+ _FP16 tmp;
#pragma omp parallel for private(tmp) reduction(+ : sum)
for (unsigned int i = 0; i < N; i++) {
tmp = X[i * incx];
sum += tmp * tmp;
}
- return static_cast<_Float16>(sqrt(sum));
+ return static_cast<_FP16>(sqrt(sum));
}
static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
CBLAS_TRANSPOSE TransB, const unsigned int M,
const unsigned int N, const unsigned int K,
- const float alpha, const _Float16 *A,
- const unsigned int lda, const _Float16 *B,
- const unsigned int ldb, const float beta, _Float16 *C,
+ const float alpha, const _FP16 *A,
+ const unsigned int lda, const _FP16 *B,
+ const unsigned int ldb, const float beta, _FP16 *C,
const unsigned int ldc) {
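// Naive reference GEMM in half precision: C = alpha * op(A) * op(B) + beta * C,
// where op() transposes A or B according to TransA / TransB.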
for (unsigned int m = 0; m < M; ++m) {
for (unsigned int n = 0; n < N; ++n) {
- _Float16 c = 0;
- _Float16 c_old = C[m * ldc + n];
+ _FP16 c = 0;
+ _FP16 c_old = C[m * ldc + n];
for (unsigned int k = 0; k < K; ++k) {
- _Float16 a, b;
+ _FP16 a, b;
a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
c += a * b;
}
- C[m * ldc + n] = static_cast<_Float16>(alpha) * c;
+ C[m * ldc + n] = static_cast<_FP16>(alpha) * c;
if (beta != 0.0)
- C[m * ldc + n] += static_cast<_Float16>(beta) * c_old;
+ C[m * ldc + n] += static_cast<_FP16>(beta) * c_old;
}
}
}
-static unsigned int isamax_FP16(const unsigned int N, const _Float16 *X,
+static unsigned int isamax_FP16(const unsigned int N, const _FP16 *X,
const int incX) {
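// Index of the element with the largest absolute value (cf. BLAS isamax).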
unsigned int max_idx = 0;
- _Float16 max_val = X[0];
+ _FP16 max_val = X[0];
for (unsigned int n = 1; n < N; n += incX) {
- _Float16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n];
+ _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n];
if (cur_val > max_val) {
max_val = cur_val;
max_idx = n;
return max_idx;
}
-void saxpy(const unsigned int N, const float alpha, const _Float16 *X,
- const int incX, _Float16 *Y, const int incY) {
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
+ const int incX, _FP16 *Y, const int incY) {
saxpy_FP16(N, alpha, X, incX, Y, incY);
}
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
- const float alpha, const _Float16 *A, const unsigned int lda,
- const _Float16 *B, const unsigned int ldb, const float beta, _Float16 *C,
+ const float alpha, const _FP16 *A, const unsigned int lda,
+ const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
const unsigned int ldc) {
sgemm_FP16(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
ldc);
}
-void scopy(const unsigned int N, const _Float16 *X, const int incX, _Float16 *Y,
+void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y,
const int incY) {
scopy_FP16(N, X, incX, Y, incY);
}
-_Float16 snrm2(const int N, const _Float16 *X, const int incX) {
+_FP16 snrm2(const int N, const _FP16 *X, const int incX) {
return snrm2_FP16(N, X, incX);
}
-_Float16 sdot(const unsigned int N, const _Float16 *X, const unsigned int incX,
- const _Float16 *Y, const unsigned int incY) {
+_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX,
+ const _FP16 *Y, const unsigned int incY) {
return sdot_FP16(N, X, incX, Y, incY);
}
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
- const unsigned int N, const float alpha, const _Float16 *A,
- const unsigned int lda, const _Float16 *X, const int incX,
- const float beta, _Float16 *Y, const int incY) {
+ const unsigned int N, const float alpha, const _FP16 *A,
+ const unsigned int lda, const _FP16 *X, const int incX,
+ const float beta, _FP16 *Y, const int incY) {
sgemv_FP16(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
-unsigned int isamax(const unsigned int N, const _Float16 *X, const int incX) {
+unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) {
/// @todo isamax_FP16 for BLAS_NUM_THREADS
return isamax_FP16(N, X, incX);
}
sscal_raw(N, alpha, (float *)X, incX);
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- sscal(N, alpha, (_Float16 *)X, incX);
+ sscal(N, alpha, (_FP16 *)X, incX);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
static_cast<float *>(Y), incY);
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- saxpy_FP16(N, alpha, static_cast<const _Float16 *>(X), incX,
- static_cast<_Float16 *>(Y), incY);
+ saxpy_FP16(N, alpha, static_cast<const _FP16 *>(X), incX,
+ static_cast<_FP16 *>(Y), incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
sgemm_FP16(order, TransA, TransB, M, N, K, alpha,
- static_cast<const _Float16 *>(A), lda,
- static_cast<const _Float16 *>(B), ldb, beta,
- static_cast<_Float16 *>(C), ldc);
+ static_cast<const _FP16 *>(A), lda,
+ static_cast<const _FP16 *>(B), ldb, beta,
+ static_cast<_FP16 *>(C), ldc);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
scopy_raw(N, (float *)X, incX, (float *)Y, incY);
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- scopy_FP16(N, (_Float16 *)X, incX, (_Float16 *)Y, incY);
+ scopy_FP16(N, (_FP16 *)X, incX, (_FP16 *)Y, incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
return sgemv_FP16(order, TransA, M, N, alpha,
- static_cast<const _Float16 *>(A), lda,
- static_cast<const _Float16 *>(X), incX, beta,
- static_cast<_Float16 *>(Y), incY);
+ static_cast<const _FP16 *>(A), lda,
+ static_cast<const _FP16 *>(X), incX, beta,
+ static_cast<_FP16 *>(Y), incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- mem_data = new MemoryData((void *)(new _Float16[dim.getDataLen()]{}));
+ mem_data = new MemoryData((void *)(new _FP16[dim.getDataLen()]{}));
data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
- delete[] mem_data->template getAddr<_Float16>();
+ delete[] mem_data->template getAddr<_FP16>();
delete mem_data;
});
#else
}
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *_data = getData<_Float16>();
- const _Float16 *_rdata = rhs.getData<_Float16>();
+ const _FP16 *_data = getData<_FP16>();
+ const _FP16 *_rdata = rhs.getData<_FP16>();
for (size_t i = 0; i < len; ++i) {
// @todo: need to check if float casting is valid
if ((std::isnan((float)_data[i]) && !std::isnan((float)_rdata[i])) ||
setDist<float, std::normal_distribution<float>>(
std::normal_distribution<float>(mean, std));
} else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
- throw std::invalid_argument(
- "_Float16 is not supported by std::normal_distribution");
+ setDist<_FP16, std::normal_distribution<float>>(
+ std::normal_distribution<float>(mean, std));
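+ // Note: std::normal_distribution is only specified for float, double and
+ // long double, so sampling stays in float here; setDist is assumed to cast
+ // each drawn value to _FP16 when storing it into the tensor.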
}
}
setDist<float, std::uniform_real_distribution<float>>(
std::uniform_real_distribution<float>(min, max));
} else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
- throw std::invalid_argument(
- "_Float16 is not supported by std::uniform_real_distribution");
+ setDist<_FP16, std::uniform_real_distribution<float>>(
+ std::uniform_real_distribution<float>(min, max));
}
}
std::bernoulli_distribution(probability));
} else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- setDist<_Float16, std::bernoulli_distribution>(
- std::bernoulli_distribution((_Float16)probability));
+ setDist<_FP16, std::bernoulli_distribution>(
+ std::bernoulli_distribution((_FP16)probability));
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
<< output.getName() << " is not allocated";
} else if (getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
- NNTR_THROW_IF(getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument)
<< getName() << " is not allocated";
- NNTR_THROW_IF(m.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument)
<< m.getName() << " is not allocated";
- NNTR_THROW_IF(output.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument)
<< output.getName() << " is not allocated";
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
output.addValue(b, c, h, w,
- getValue<_Float16>(b, c, h, w) *
- m.getValue<_Float16>(b, c, h, w),
+ getValue<_FP16>(b, c, h, w) *
+ m.getValue<_FP16>(b, c, h, w),
beta);
}
}
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
- _Float16 *out_data = output.getAddress<_Float16>(b, c, h, 0);
- const _Float16 *m_data = m.getAddress<_Float16>(b, c, h, 0);
- const _Float16 *in_data = getAddress<_Float16>(b, c, h, 0);
+ _FP16 *out_data = output.getAddress<_FP16>(b, c, h, 0);
+ const _FP16 *m_data = m.getAddress<_FP16>(b, c, h, 0);
+ const _FP16 *in_data = getAddress<_FP16>(b, c, h, 0);
std::transform(in_data, in_data + width(), m_data, out_data,
- std::multiplies<_Float16>());
+ std::multiplies<_FP16>());
}
}
}
for (unsigned int w = 0; w < width(); ++w) {
for (unsigned int c = 0; c < channel(); ++c) {
output.addValue(b, c, h, w,
- getValue<_Float16>(b, c, h, w) *
- m.getValue<_Float16>(b, c, h, w),
+ getValue<_FP16>(b, c, h, w) *
+ m.getValue<_FP16>(b, c, h, w),
beta);
}
}
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
- _Float16 *out_data = output.getAddress<_Float16>(b, 0, h, w);
- const _Float16 *m_data = m.getAddress<_Float16>(b, 0, h, w);
- const _Float16 *in_data = getAddress<_Float16>(b, 0, h, w);
+ _FP16 *out_data = output.getAddress<_FP16>(b, 0, h, w);
+ const _FP16 *m_data = m.getAddress<_FP16>(b, 0, h, w);
+ const _FP16 *in_data = getAddress<_FP16>(b, 0, h, w);
std::transform(in_data, in_data + channel(), m_data, out_data,
- std::multiplies<_Float16>());
+ std::multiplies<_FP16>());
}
}
}
<< output.getName() << " is not allocated";
} else if (getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
- NNTR_THROW_IF(getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument)
<< getName() << " is not allocated";
- NNTR_THROW_IF(m.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument)
<< m.getName() << " is not allocated";
- NNTR_THROW_IF(output.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument)
<< output.getName() << " is not allocated";
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
output.setValue(b, c, h, w,
- getValue<_Float16>(b, c, h, w) +
- m.getValue<_Float16>(b, c, h, w) * beta);
+ getValue<_FP16>(b, c, h, w) +
+ m.getValue<_FP16>(b, c, h, w) * beta);
}
}
}
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
- _Float16 *out_data = output.getAddress<_Float16>(b, c, h, 0);
- const _Float16 *m_data = m.getAddress<_Float16>(b, c, h, 0);
- const _Float16 *in_data = getAddress<_Float16>(b, c, h, 0);
+ _FP16 *out_data = output.getAddress<_FP16>(b, c, h, 0);
+ const _FP16 *m_data = m.getAddress<_FP16>(b, c, h, 0);
+ const _FP16 *in_data = getAddress<_FP16>(b, c, h, 0);
std::transform(in_data, in_data + width(), m_data, out_data,
- std::plus<_Float16>());
+ std::plus<_FP16>());
}
}
}
for (unsigned int w = 0; w < width(); ++w) {
for (unsigned int c = 0; c < channel(); ++c) {
output.setValue(b, c, h, w,
- getValue<_Float16>(b, c, h, w) +
- m.getValue<_Float16>(b, c, h, w) * beta);
+ getValue<_FP16>(b, c, h, w) +
+ m.getValue<_FP16>(b, c, h, w) * beta);
}
}
}
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
- _Float16 *out_data = output.getAddress<_Float16>(b, 0, h, w);
- const _Float16 *m_data = m.getAddress<_Float16>(b, 0, h, w);
- const _Float16 *in_data = getAddress<_Float16>(b, 0, h, w);
+ _FP16 *out_data = output.getAddress<_FP16>(b, 0, h, w);
+ const _FP16 *m_data = m.getAddress<_FP16>(b, 0, h, w);
+ const _FP16 *in_data = getAddress<_FP16>(b, 0, h, w);
std::transform(in_data, in_data + channel(), m_data, out_data,
- std::plus<_Float16>());
+ std::plus<_FP16>());
}
}
}
sscal(len, value, data, 1);
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- _Float16 *data = getData<_Float16>();
+ _FP16 *data = getData<_FP16>();
unsigned int len = size();
sscal(len, value, data, 1);
#else
Tensor &Tensor::multiply(float const &value, Tensor &out) const {
/// @todo add unittest
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ // if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
return apply(f, out);
- } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- auto f = std::bind(std::multiplies<_Float16>(), std::placeholders::_1,
- static_cast<_Float16>(value));
- return apply(f, out);
-#else
- throw std::invalid_argument("Error: enable-fp16 is not enabled");
-#endif
- }
+// } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+// #ifdef ENABLE_FP16
+// auto f = std::bind(std::multiplies<_FP16>(), std::placeholders::_1,
+// static_cast<_FP16>(value));
+// return apply(f, out);
+// #else
+// throw std::invalid_argument("Error: enable-fp16 is not enabled");
+// #endif
+ // }
return out;
}
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- auto f = [&](const BroadcastInfo &e, const _Float16 *buf,
- const _Float16 *m_buf, _Float16 *out_buf) {
+ auto f = [&](const BroadcastInfo &e, const _FP16 *buf,
+ const _FP16 *m_buf, _FP16 *out_buf) {
if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
beta == 0.0) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::multiplies<_Float16>());
+ std::multiplies<_FP16>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf * *m_buf + static_cast<_Float16>(beta) * *out_buf;
+ *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf;
buf += strides[3];
m_buf += e.strides[3];
out_buf += output.strides[3];
}
Tensor &Tensor::divide(float const &value, Tensor &out) const {
- /// @todo add unittest, _Float16 ZeroDivisionError
+ /// @todo add unittest, _FP16 ZeroDivisionError
if (value == 0.0f) {
std::stringstream ss;
ss << "[Tensor] divide by value failed, value: " << value;
throw std::invalid_argument(ss.str().c_str());
}
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ // if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
return apply(f, out);
- } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- auto f = std::bind(std::divides<_Float16>(), std::placeholders::_1, static_cast<_Float16>(value));
- return apply(f, out);
-#else
- throw std::invalid_argument("Error: enable-fp16 is not enabled");
-#endif
- }
+// } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+// #ifdef ENABLE_FP16
+// auto f = std::bind(std::divides<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value));
+// return apply(f, out);
+// #else
+// throw std::invalid_argument("Error: enable-fp16 is not enabled");
+// #endif
+// }
return out;
}
apply_broadcast(m, f, output);
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- auto f = [&](const BroadcastInfo &e, const _Float16 *buf,
- const _Float16 *m_buf, _Float16 *out_buf) {
+ auto f = [&](const BroadcastInfo &e, const _FP16 *buf,
+ const _FP16 *m_buf, _FP16 *out_buf) {
if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::divides<_Float16>());
+ std::divides<_FP16>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf / *m_buf;
Tensor &Tensor::add(float const &value, Tensor &out) const {
/// @todo add unittest
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ // if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
return apply(f, out);
- } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- auto f = std::bind(std::plus<_Float16>(), std::placeholders::_1,
- static_cast<_Float16>(value));
- return apply(f, out);
-#else
- throw std::invalid_argument("Error: enable-fp16 is not enabled");
-#endif
- }
+// } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+// #ifdef ENABLE_FP16
+// auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1,
+// static_cast<_FP16>(value));
+// return apply(f, out);
+// #else
+// throw std::invalid_argument("Error: enable-fp16 is not enabled");
+// #endif
+// }
return out;
}
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- auto f = [&](const BroadcastInfo &e, const _Float16 *buf,
- const _Float16 *m_buf, _Float16 *out_buf) {
+ auto f = [&](const BroadcastInfo &e, const _FP16 *buf,
+ const _FP16 *m_buf, _FP16 *out_buf) {
saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
- /// @todo: saxpy is not valid for _Float16
+ /// @todo: saxpy is not valid for _FP16
};
/// @todo: enable this after add_strided supports broadcast
apply_broadcast(m, f, output);
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- auto f = [&](const BroadcastInfo &e, const _Float16 *buf,
- const _Float16 *m_buf, _Float16 *out_buf) {
+ auto f = [&](const BroadcastInfo &e, const _FP16 *buf,
+ const _FP16 *m_buf, _FP16 *out_buf) {
if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
alpha == 0) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::plus<_Float16>());
+ std::plus<_FP16>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf + *m_buf * static_cast<_Float16>(alpha);
+ *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha);
buf += strides[3];
m_buf += e.strides[3];
out_buf += strides[3];
Tensor &Tensor::subtract(float const &value, Tensor &out) const {
/// @todo add unittest
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ // if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
return apply(f, out);
- } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- auto f = std::bind(std::minus<_Float16>(), std::placeholders::_1,
- static_cast<_Float16>(value));
- return apply(f, out);
-#else
- ml_loge("%s", "Error: enable-fp16 is not enabled");
-#endif
- }
+// } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+// #ifdef ENABLE_FP16
+// auto f = std::bind(std::minus<_FP16>(), std::placeholders::_1,
+// static_cast<_FP16>(value));
+// return apply(f, out);
+// #else
+// ml_loge("%s", "Error: enable-fp16 is not enabled");
+// #endif
+// }
return out; // shouldn't reach
}
}
Tensor &Tensor::pow(float exponent, Tensor &out) const {
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ // if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
auto f = [exponent](float in) { return powf(in, exponent); };
return apply(f, out);
- }
- if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
-#ifdef ENABLE_FP16
- auto f = [exponent](_Float16 in) {
- return static_cast<_Float16>(powf(in, exponent));
- };
- return apply(f, out);
-#else
- ml_loge("%s", "Error: enable-fp16 is not enabled");
-#endif
- }
- return out;
+ // }
+// if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+// #ifdef ENABLE_FP16
+// auto f = [exponent](_FP16 in) {
+// return static_cast<_FP16>(powf(in, exponent));
+// };
+// return apply(f, out);
+// #else
+// ml_loge("%s", "Error: enable-fp16 is not enabled");
+// #endif
+// }
+ // return out;
}
Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
auto iter_value =
[this, is_format_nchw](
std::array<size_t, 4> &loc, const std::array<size_t, 4> &end_loc,
- const std::array<size_t, 4> &reset_dim_arr) -> _Float16 & {
+ const std::array<size_t, 4> &reset_dim_arr) -> _FP16 & {
auto &value = (is_format_nchw)
- ? getValue<_Float16>(loc[0], loc[1], loc[2], loc[3])
- : getValue<_Float16>(loc[0], loc[3], loc[1], loc[2]);
+ ? getValue<_FP16>(loc[0], loc[1], loc[2], loc[3])
+ : getValue<_FP16>(loc[0], loc[3], loc[1], loc[2]);
for (int i = 3; i >= 0; --i) {
loc[i]++;
if (loc[i] == end_loc[i]) {
ret_dims[i].width(), ret_dims[i].channel()};
}
- ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](_Float16 _) {
+ ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
return iter_value(loc, end_loc, reset_dim_arr);
});
}
auto iter_value =
[is_format_nchw](
std::array<unsigned, 4> &loc, const std::array<unsigned, 4> &start_loc,
- Tensor &t, const std::array<unsigned, 4> &ref_dim_arr) -> _Float16 & {
+ Tensor &t, const std::array<unsigned, 4> &ref_dim_arr) -> _FP16 & {
auto &value = is_format_nchw
- ? t.getValue<_Float16>(loc[0], loc[1], loc[2], loc[3])
- : t.getValue<_Float16>(loc[0], loc[3], loc[1], loc[2]);
+ ? t.getValue<_FP16>(loc[0], loc[1], loc[2], loc[3])
+ : t.getValue<_FP16>(loc[0], loc[3], loc[1], loc[2]);
for (int i = 3; i >= 0; --i) {
loc[i]++;
for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
iter_value(loc, start_loc, ret, tensor_dim_arr) =
- t.getValue<_Float16>(i);
+ t.getValue<_FP16>(i);
}
if (is_format_nchw) {
#ifdef ENABLE_FP16
void Tensor::apply_broadcast(
Tensor const &m,
- std::function<void(const BroadcastInfo &e, const _Float16 *, const _Float16 *,
- _Float16 *)>
+ std::function<void(const BroadcastInfo &e, const _FP16 *, const _FP16 *,
+ _FP16 *)>
v_func,
Tensor &output) const {
CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
- NNTR_THROW_IF(getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument)
<< getName() << " is not allocated";
- NNTR_THROW_IF(m.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument)
<< m.getName() << " is not allocated";
- NNTR_THROW_IF(output.getData<_Float16>() == nullptr, std::invalid_argument)
+ NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument)
<< output.getName() << " is not allocated";
/// shortcut to cover when dimension matches
BroadcastInfo e;
e.buffer_size = size();
e.strides[3] = 1;
- v_func(e, getData<_Float16>(), m.getData<_Float16>(),
- output.getData<_Float16>());
+ v_func(e, getData<_FP16>(), m.getData<_FP16>(),
+ output.getData<_FP16>());
return;
}
void Tensor::apply_broadcast_util(
Tensor const &m,
- std::function<void(const BroadcastInfo &e, const _Float16 *, const _Float16 *,
- _Float16 *)>
+ std::function<void(const BroadcastInfo &e, const _FP16 *, const _FP16 *,
+ _FP16 *)>
v_func,
Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
size_t m_offset) const {
- const _Float16 *buf = this->getData<_Float16>();
- const _Float16 *m_buf = m.getData<_Float16>();
- _Float16 *out_buf = output.getData<_Float16>();
+ const _FP16 *buf = this->getData<_FP16>();
+ const _FP16 *m_buf = m.getData<_FP16>();
+ _FP16 *out_buf = output.getData<_FP16>();
if (e.buffer_axis == cur_axis) {
v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
ones.getData<float>(), 1, 0.0, rdata, 1);
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
- _Float16 *rdata = ret.getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
+ _FP16 *rdata = ret.getData<_FP16>();
Tensor ones(1, 1, 1, feat_len, this->getTensorType());
- ones.setValue((_Float16)1.0);
+ ones.setValue((_FP16)1.0);
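+ // Sum over the feature axis expressed as a GEMV with a vector of ones:
+ // rdata[b] = sum over f of data[b * feat_len + f].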
sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
- ones.getData<_Float16>(), 1, 0.0, rdata, 1);
+ ones.getData<_FP16>(), 1, 0.0, rdata, 1);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot sum";
if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
CREATE_IF_EMPTY_DIMS(ret, dim);
- ret.copy(this->getData<_Float16>());
+ ret.copy(this->getData<_FP16>());
return ret;
}
Tensor ones(1, 1, 1, batch, this->getTensorType());
ones.setValue(alpha);
sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
- ones.getData<_Float16>(), 1, beta, ret.getData<_Float16>(), 1);
+ ones.getData<_FP16>(), 1, beta, ret.getData<_FP16>(), 1);
} break;
case 1: {
CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
Tensor ones(1, 1, 1, n, this->getTensorType());
ones.setValue(alpha);
sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
- ones.getData<_Float16>(), 1, beta, ret.getData<_Float16>(), 1);
+ ones.getData<_FP16>(), 1, beta, ret.getData<_FP16>(), 1);
} else {
unsigned int feat_len = dim[2] * dim[3];
unsigned int t_axis = dim[1];
Tensor ones(1, 1, 1, t_axis, getTensorType());
ones.setValue(alpha);
- _Float16 *rdata = ret.getData<_Float16>();
+ _FP16 *rdata = ret.getData<_FP16>();
for (unsigned int k = 0; k < dim[0]; ++k) {
sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
&data[k * dim.getFeatureLen()], feat_len,
- ones.getData<_Float16>(), 1, beta, &rdata[k * feat_len], 1);
+ ones.getData<_FP16>(), 1, beta, &rdata[k * feat_len], 1);
}
}
} break;
unsigned int t_axis = dim[2];
Tensor ones(1, 1, 1, t_axis, getTensorType());
ones.setValue(alpha);
- _Float16 *rdata = ret.getData<_Float16>();
+ _FP16 *rdata = ret.getData<_FP16>();
for (unsigned int k = 0; k < dim[0]; ++k) {
sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
&data[k * dim.getFeatureLen()], feat_len,
- ones.getData<_Float16>(), 1, beta, &rdata[k * feat_len], 1);
+ ones.getData<_FP16>(), 1, beta, &rdata[k * feat_len], 1);
}
} else {
unsigned int t_3 = dim[3];
unsigned int t_axis = dim[2];
Tensor ones(1, 1, 1, t_axis, getTensorType());
ones.setValue(alpha);
- _Float16 *rdata = ret.getData<_Float16>();
+ _FP16 *rdata = ret.getData<_FP16>();
for (unsigned int k = 0; k < dim[0]; ++k) {
for (unsigned int c = 0; c < dim[1]; ++c) {
unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
- ones.getData<_Float16>(), 1, beta, &rdata[ridx], 1);
+ ones.getData<_FP16>(), 1, beta, &rdata[ridx], 1);
}
}
}
unsigned int t_axis = dim[3];
Tensor ones(1, 1, 1, t_axis, getTensorType());
ones.setValue(alpha);
- _Float16 *rdata = ret.getData<_Float16>();
+ _FP16 *rdata = ret.getData<_FP16>();
for (unsigned int k = 0; k < dim[0]; ++k) {
for (unsigned int c = 0; c < dim[2]; ++c) {
unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
- ones.getData<_Float16>(), 1, beta, &rdata[ridx], 1);
+ ones.getData<_FP16>(), 1, beta, &rdata[ridx], 1);
}
}
} else {
Tensor ones(1, 1, 1, n, getTensorType());
ones.setValue(alpha);
sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
- ones.getData<_Float16>(), 1, beta, ret.getData<_Float16>(), 1);
+ ones.getData<_FP16>(), 1, beta, ret.getData<_FP16>(), 1);
}
} break;
default:
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
- const _Float16 *mdata = m.getData<_Float16>();
- _Float16 *rdata = result.getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
+ const _FP16 *mdata = m.getData<_FP16>();
+ _FP16 *rdata = result.getData<_FP16>();
const float alpha = 1.0f;
enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
/// case1: (1 * K) X (K * 1)
if (M == 1 && N == 1) {
*rdata =
- sdot(K, data, 1, mdata, 1) + static_cast<_Float16>(beta) * (*rdata);
+ sdot(K, data, 1, mdata, 1) + static_cast<_FP16>(beta) * (*rdata);
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *inptr = getData<_Float16>();
- _Float16 *outptr = out.getData<_Float16>();
+ const _FP16 *inptr = getData<_FP16>();
+ _FP16 *outptr = out.getData<_FP16>();
switch (indexI) {
case 0:
if (indexJ == 1) {
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
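// Dropout mask in half precision: elements drawn at or above the dropout rate
// are rescaled by 1 / (1 - dropout) so the expected activation is preserved.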
- _Float16 scale = static_cast<_Float16>(1.0 / (1 - dropout));
- _Float16 *data_ = getData<_Float16>();
+ _FP16 scale = static_cast<_FP16>(1.0 / (1 - dropout));
+ _FP16 *data_ = getData<_FP16>();
for (unsigned int i = 0; i < size(); ++i) {
if (data_[i] >= dropout)
data_[i] = scale;
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
for (unsigned int b = 0; b < batch(); b++) {
- _Float16 *addr = getAddress<_Float16>(b, 0, 0, 0);
+ _FP16 *addr = getAddress<_FP16>(b, 0, 0, 0);
const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
- std::fill(addr, addr + (*mask_len_val), (_Float16)en_mask_val);
+ std::fill(addr, addr + (*mask_len_val), (_FP16)en_mask_val);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
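// Build the complementary zoneout mask: positions where the Bernoulli draw
// fired become 0, all others become 1.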
- _Float16 zoneout_fp16 = (_Float16)zoneout;
+ _FP16 zoneout_fp16 = (_FP16)zoneout;
opposite.setRandBernoulli(zoneout_fp16);
- _Float16 *data = getData<_Float16>();
- _Float16 *opposite_data = opposite.getData<_Float16>();
+ _FP16 *data = getData<_FP16>();
+ _FP16 *opposite_data = opposite.getData<_FP16>();
for (unsigned int i = 0; i < size(); ++i) {
if (opposite_data[i] > epsilon) {
- data[i] = (_Float16)0.0;
+ data[i] = (_FP16)0.0;
} else {
- data[i] = (_Float16)1.0;
+ data[i] = (_FP16)1.0;
}
}
#else
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
unsigned int len = size();
out << "data addr: " << data << '\n';
out << dim;
for (unsigned int i = 0; i < height(); i++) {
for (unsigned int j = 0; j < width(); j++) {
out << std::setw(10) << std::setprecision(10)
- << (float)this->getValue<_Float16>(k, l, i, j) << " ";
+ << (float)this->getValue<_FP16>(k, l, i, j) << " ";
}
out << std::endl;
}
for (unsigned int j = 0; j < width(); j++) {
for (unsigned int l = 0; l < channel(); l++) {
out << std::setw(10) << std::setprecision(10)
- << (float)this->getValue<_Float16>(k, l, i, j) << " ";
+ << (float)this->getValue<_FP16>(k, l, i, j) << " ";
}
out << std::endl;
}
if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- if (buf == getData<_Float16>()) {
+ if (buf == getData<_FP16>()) {
return;
}
#else
if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- scopy(size(), (_Float16 *)buf, 1, getData<_Float16>(), 1);
+ scopy(size(), (_FP16 *)buf, 1, getData<_FP16>(), 1);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
- setValue(b, c, h, w, from.getValue<_Float16>(b, c, h, w));
+ setValue(b, c, h, w, from.getValue<_FP16>(b, c, h, w));
}
}
}
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
- setValue(b, c, h, w, from.getValue<_Float16>(b, c, h, w));
+ setValue(b, c, h, w, from.getValue<_FP16>(b, c, h, w));
}
}
}
copy(from.getData());
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- copy(from.getData<_Float16>());
+ copy(from.getData<_FP16>());
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
std::fill(data, data + size(), val);
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- _Float16 *data = getData<_Float16>();
- std::fill(data, data + size(), static_cast<_Float16>(val));
+ _FP16 *data = getData<_FP16>();
+ std::fill(data, data + size(), static_cast<_FP16>(val));
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
if (contiguous)
- sscal(size(), 0, getData<_Float16>(), 1);
+ sscal(size(), 0, getData<_FP16>(), 1);
else
- apply_i([](_Float16 val) -> _Float16 { return 0; });
+ apply_i([](float val) -> float { return 0; });
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
size_t batch_size = batch();
size_t feature_len = dim.getFeatureLen();
ret = snrm2(len, data, 1);
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
ret = snrm2(len, data, 1);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
unsigned int idx = isamax(len, data, 1);
ret = *(data + idx);
}
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
- const _Float16 *data = getData<_Float16>();
+ const _FP16 *data = getData<_FP16>();
auto bounds = std::minmax_element(data, data + size());
- const _Float16 min = *bounds.first;
- const _Float16 max = *bounds.second;
+ const _FP16 min = *bounds.first;
+ const _FP16 max = *bounds.second;
if (max == min) {
Tensor tmp = *this;
#ifdef ENABLE_FP16
Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
std_dev_by_batch.setZero();
- _Float16 *std_dev = std_dev_by_batch.getData<_Float16>();
+ _FP16 *std_dev = std_dev_by_batch.getData<_FP16>();
for (unsigned int k = 0; k < dim.batch(); ++k) {
Tensor sub_this = this->getBatchSlice(k, 1);
- std_dev[k] = static_cast<_Float16>(sub_this.l2norm());
+ std_dev[k] = static_cast<_FP16>(sub_this.l2norm());
}
std_dev_by_batch.divide_i(dim.getFeatureLen());
for (unsigned int k = 0; k < in.height(); ++k) {
for (unsigned int l = 0; l < in.width(); ++l) {
output.setValue(i, j, k, l,
- in.getValue<_Float16>(i, j, (in.height() - k - 1),
+ in.getValue<_FP16>(i, j, (in.height() - k - 1),
(in.width() - l - 1)));
}
}
Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
#ifdef ENABLE_FP16
- Tensor(std::vector<std::vector<std::vector<std::vector<_Float16>>>> const &d,
+ Tensor(std::vector<std::vector<std::vector<std::vector<_FP16>>>> const &d,
ml::train::TensorDim::TensorType t_type) {
if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
strides = dim.computeStrides();
MemoryData *mem_data =
- new MemoryData((void *)(new _Float16[dim.getDataLen()]()));
+ new MemoryData((void *)(new _FP16[dim.getDataLen()]()));
data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
- delete[] mem_data->getAddr<_Float16>();
+ delete[] mem_data->getAddr<_FP16>();
});
offset = 0;
contiguous = true;
* @note This constructor copies vector again. needs refactoring
* @param[in] d data for the Tensor
*/
- Tensor(std::vector<std::vector<std::vector<_Float16>>> const &d,
+ Tensor(std::vector<std::vector<std::vector<_FP16>>> const &d,
ml::train::TensorDim::TensorType t_type) :
Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
* @note This constructor copies vector again. needs refactoring
* @param[in] d data for the Tensor with batch size one
*/
- Tensor(std::vector<std::vector<_Float16>> const &d,
+ Tensor(std::vector<std::vector<_FP16>> const &d,
ml::train::TensorDim::TensorType t_type) :
Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
* @param[out] output output tensor
* @retval Tensor
*/
+
Tensor &apply(std::function<float(float)> f, Tensor &output) const {
CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
}
}
}
- }
- return output;
- };
-
- /**
- * @brief Apply instantly to the element
- *
- * @param f function to apply
- * @return int ML_ERROR_NONE if successful
- */
- int apply_i(std::function<_Float16(_Float16)> f) {
- Tensor result = *this;
- apply(f, result);
-
- return ML_ERROR_NONE;
- };
+ } else if (dim.getDataType() == Tdatatype::FP16) {
- /**
- * @brief Apply function element by element
- * @param[in] *function function pointer applied
- * @retval Tensor
- */
- Tensor apply(std::function<_Float16(_Float16)> f) const {
- Tensor result;
- return apply(f, result);
- };
+ auto f_16 = [f](_FP16 x) -> _FP16 {
+ return static_cast<_FP16>(f(static_cast<float>(x)));
+ };
- /**
- * @brief Apply function element by element
- * @param[in] *function function pointer applied
- * @param[out] output output tensor
- * @retval Tensor
- */
- Tensor &apply(std::function<_Float16(_Float16)> f, Tensor &output) const {
- CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
+ // std::function<_FP16(_FP16)> f_16 =
+ // static_cast<std::function<_FP16(_FP16)>>(f);
- if (dim != output.dim) {
- /// @todo add unittest
- throw std::invalid_argument(
- "[Tensor::apply] output dimension does not match");
- }
- #ifdef ENABLE_FP16
+
if (contiguous && output.contiguous) {
- const _Float16 *data = (getData<_Float16>());
- _Float16 *rdata = (output.getData<_Float16>());
+ const _FP16 *data = (getData<_FP16>());
+ _FP16 *rdata = (output.getData<_FP16>());
- std::transform(data, data + size(), rdata, f);
+ std::transform(data, data + size(), rdata, f_16);
} else if (strides[3] == 1 && output.strides[3] == 1) {
/** @todo optimize this by combining these loops where stride is 1 */
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
- _Float16 *out_data = (_Float16 *)output.getAddress(b, c, h, 0);
- const _Float16 *in_data = (_Float16 *)getAddress(b, c, h, 0);
- std::transform(in_data, in_data + width(), out_data, f);
+ _FP16 *out_data = output.getAddress<_FP16>(b, c, h, 0);
+ const _FP16 *in_data = getAddress<_FP16>(b, c, h, 0);
+ std::transform(in_data, in_data + width(), out_data, f_16);
}
}
}
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
for (unsigned int w = 0; w < width(); ++w) {
- output.setValue(b, c, h, w,
- f((_Float16)((_Float16)getValue(b, c, h, w))));
+ output.setValue(b, c, h, w, f_16(getValue<_FP16>(b, c, h, w)));
}
}
}
}
}
- #else
- throw std::invalid_argument("Error: enable-fp16 is not enabled");
- #endif
-
+ }
return output;
};
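// Illustrative use of the unified overload above (hypothetical tensor t): the
// same float functor now runs directly on FP32 data and, through the f_16
// wrapper, element-wise on FP16 data via a round trip through float.
//
//   Tensor out;
//   t.apply([](float x) { return x > 0.0f ? x : 0.0f; }, out);  // ReLU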
+ // /**
+ // * @brief Apply instantly to the element
+ // *
+ // * @param f function to apply
+ // * @return int ML_ERROR_NONE if successful
+ // */
+ // int apply_i(std::function<_FP16(_FP16)> f) {
+ // Tensor result = *this;
+ // apply(f, result);
+
+ // return ML_ERROR_NONE;
+ // };
+
+ // /**
+ // * @brief Apply function element by element
+ // * @param[in] *function function pointer applied
+ // * @retval Tensor
+ // */
+ // Tensor apply(std::function<_FP16(_FP16)> f) const {
+ // Tensor result;
+ // return apply(f, result);
+ // };
+
+ // /**
+ // * @brief Apply function element by element
+ // * @param[in] *function function pointer applied
+ // * @param[out] output output tensor
+ // * @retval Tensor
+ // */
+ // Tensor &apply(std::function<_FP16(_FP16)> f, Tensor &output) const {
+ // CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
+
+ // if (dim != output.dim) {
+ // /// @todo add unittest
+ // throw std::invalid_argument(
+ // "[Tensor::apply] output dimension does not match");
+ // }
+
+ // #ifdef ENABLE_FP16
+ // if (contiguous && output.contiguous) {
+ // const _FP16 *data = (getData<_FP16>());
+ // _FP16 *rdata = (output.getData<_FP16>());
+
+ // std::transform(data, data + size(), rdata, f);
+ // } else if (strides[3] == 1 && output.strides[3] == 1) {
+ // /** @todo optimize this with combining these loops where stride is 1 */
+ // for (unsigned int b = 0; b < batch(); ++b) {
+ // for (unsigned int c = 0; c < channel(); ++c) {
+ // for (unsigned int h = 0; h < height(); ++h) {
+ // _FP16 *out_data = (_FP16 *)output.getAddress(b, c, h, 0);
+ // const _FP16 *in_data = (_FP16 *)getAddress(b, c, h, 0);
+ // std::transform(in_data, in_data + width(), out_data, f);
+ // }
+ // }
+ // }
+ // } else {
+ // for (unsigned int b = 0; b < batch(); ++b) {
+ // for (unsigned int c = 0; c < channel(); ++c) {
+ // for (unsigned int h = 0; h < height(); ++h) {
+ // for (unsigned int w = 0; w < width(); ++w) {
+ // output.setValue(b, c, h, w,
+ // f((_FP16)((_FP16)getValue(b, c, h, w))));
+ // }
+ // }
+ // }
+ // }
+ // }
+ // #else
+ // throw std::invalid_argument("Error: enable-fp16 is not enabled");
+ // #endif
+
+ // return output;
+ // };
+
/**
* @brief Apply function to Tensor
* @param[in] *function function pointer applied
getData<float>()[getIndex(batch, c, h, w)] = value;
} else if (getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
- getData<_Float16>()[getIndex(batch, c, h, w)] = static_cast<_Float16>(value);
+ getData<_FP16>()[getIndex(batch, c, h, w)] = static_cast<_FP16>(value);
#else
ml_loge("%s", "Error: enable-fp16 is not enabled");
#endif
getData<float>()[idx] += value;
} else if (dim.getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
- getData<_Float16>()[idx] *= static_cast<_Float16>(beta);
- getData<_Float16>()[idx] += static_cast<_Float16>(value);
+ getData<_FP16>()[idx] *= static_cast<_FP16>(beta);
+ getData<_FP16>()[idx] += static_cast<_FP16>(value);
#else
ml_loge("%s", "Error: enable-fp16 is not enabled");
#endif
#ifdef ENABLE_FP16
void apply_broadcast_util(
Tensor const &m,
- std::function<void(const BroadcastInfo &e, const _Float16 *, const _Float16 *,
- _Float16 *)>
+ std::function<void(const BroadcastInfo &e, const _FP16 *, const _FP16 *,
+ _FP16 *)>
v_func,
Tensor &output, const BroadcastInfo &e, int cur_axis = -1,
size_t offset = 0, size_t m_offset = 0) const;
void
apply_broadcast(Tensor const &m,
- std::function<void(const BroadcastInfo &e, const _Float16 *,
- const _Float16 *, _Float16 *)>
+ std::function<void(const BroadcastInfo &e, const _FP16 *,
+ const _FP16 *, _FP16 *)>
v_func,
Tensor &output) const;
#endif