} \
} while (0);
-
#define transposeloop_nhwc(cl, ci, cj, ck, sl, si, sj, sk) \
do { \
unsigned int i, j, k, l; \
} \
} while (0);
-
namespace nntrainer {
/**
*
*/
BroadcastInfo() :
- buffer_size(0),
- buffer_axis(-1),
- strides{0, 0, 0, 0},
- fm(Tformat::NCHW) {}
+ buffer_size(0), buffer_axis(-1), strides{0, 0, 0, 0}, fm(Tformat::NCHW) {}
unsigned int buffer_size; /**< virtual size of the buffer */
int buffer_axis; /**< the smallest axis that should be looped.
-1 means no loop needed*/
std::array<unsigned int, TensorDim::MAXDIM>
- strides; /**< modified strides for the loop */
+ strides; /**< modified strides for the loop */
Tformat fm;
};
SrcSharedTensor() : src(nullptr), off(0) {}
SrcSharedTensor(const Tensor *tensor, size_t offset) :
- src(tensor),
- off(offset) {}
+ src(tensor), off(offset) {}
/**
* @brief Get the allocated src tensor
if (getDataType() == ml::train::TensorDim::DataType::FP32) {
mem_data = new MemoryData((void *)(new float[dim.getDataLen()]()));
data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
- delete[](float *) mem_data->getAddr();
+ delete[] (float *)mem_data->getAddr();
delete mem_data;
});
} else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
mem_data = new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
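+    /// note: __fp16 is an Arm C language extension (storage-only on some
+    /// other targets); relying on it here is a platform assumption.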
data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
- delete[](__fp16 *) mem_data->getAddr();
+ delete[] (__fp16 *)mem_data->getAddr();
delete mem_data;
});
}
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int c = 0; c < channel(); ++c) {
for (unsigned int h = 0; h < height(); ++h) {
- float *out_data = output.getAddress<float>(b, c, h, 0);
- const float *m_data = m.getAddress<float>(b, c, h, 0);
- const float *in_data = getAddress<float>(b, c, h, 0);
+ __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
+ const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
+ const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
std::transform(in_data, in_data + width(), m_data, out_data,
std::multiplies<__fp16>());
}
apply_broadcast(m, f, output);
return output;
-
+
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
__fp16 *out_buf) {
Tensor &Tensor::divide(float const &value, Tensor &out) const {
auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
- /// @todo add unittest
+ /// @todo add unittest, __fp16 ZeroDivisionError
if (value == 0.0f) {
std::stringstream ss;
ss << "[Tensor] divide by value failed, value: " << value;
}
Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
- auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
- float *out_buf) {
- if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::divides<float>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf / *m_buf;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += output.strides[3];
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
+ float *out_buf) {
+ if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
+ std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
+ std::divides<float>());
+ } else {
+ for (unsigned int i = 0; i < e.buffer_size; ++i) {
+ *out_buf = *buf / *m_buf;
+ buf += strides[3];
+ m_buf += e.strides[3];
+ out_buf += output.strides[3];
+ }
}
- }
- };
+ };
- NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
- std::invalid_argument)
- << getName() << " is not contiguous, cannot divide";
+ NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
+ std::invalid_argument)
+ << getName() << " is not contiguous, cannot divide";
- apply_broadcast(m, f, output);
+ apply_broadcast(m, f, output);
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
+ __fp16 *out_buf) {
+ if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
+ std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
+ std::divides<__fp16>());
+ } else {
+ for (unsigned int i = 0; i < e.buffer_size; ++i) {
+ *out_buf = *buf / *m_buf;
+ buf += strides[3];
+ m_buf += e.strides[3];
+ out_buf += output.strides[3];
+ }
+ }
+ };
+
+ NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
+ std::invalid_argument)
+ << getName() << " is not contiguous, cannot divide";
+
+ apply_broadcast(m, f, output);
+ }
return output;
}
Tensor &Tensor::add(float const &value, Tensor &out) const {
/// @todo add unittest
- auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
- return apply(f, out);
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
+ return apply(f, out);
+ } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto f = std::bind(std::plus<__fp16>(), std::placeholders::_1, value);
+ return apply(f, out);
+ }
+ return out;
}
int Tensor::add_i(Tensor const &m, float const alpha) {
/// @todo: add axis rather doing add over the last two dimensions always
/// operator i has optimized version
- auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
- float *out_buf) {
- saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
- };
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
+ float *out_buf) {
+ saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
+ };
- /// @todo: enable this after add_strided supports broadcast
- // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
- // << getName() << " is not contiguous, cannot add";
+ /// @todo: enable this after add_strided supports broadcast
+ // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
+ // << getName() << " is not contiguous, cannot add";
- try {
- apply_broadcast(m, f, *this);
- } catch (std::exception &err) {
- ml_loge("%s %s", typeid(err).name(), err.what());
- return ML_ERROR_INVALID_PARAMETER;
- }
+ try {
+ apply_broadcast(m, f, *this);
+ } catch (std::exception &err) {
+ ml_loge("%s %s", typeid(err).name(), err.what());
+ return ML_ERROR_INVALID_PARAMETER;
+ }
- return ML_ERROR_NONE;
+ return ML_ERROR_NONE;
+ } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
+ __fp16 *out_buf) {
+ saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
+ /// @todo: saxpy is not valid for __fp16
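+    /// A minimal scalar fallback, assuming no half-precision BLAS is
+    /// available, could accumulate in float and narrow on store:
+    ///   for (unsigned int i = 0; i < e.buffer_size; ++i)
+    ///     out_buf[i * strides[3]] = static_cast<__fp16>(
+    ///       static_cast<float>(out_buf[i * strides[3]]) +
+    ///       alpha * static_cast<float>(m_buf[i * e.strides[3]]));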
+ };
+
+ /// @todo: enable this after add_strided supports broadcast
+ // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
+ // << getName() << " is not contiguous, cannot add";
+
+ try {
+ apply_broadcast(m, f, *this);
+ } catch (std::exception &err) {
+ ml_loge("%s %s", typeid(err).name(), err.what());
+ return ML_ERROR_INVALID_PARAMETER;
+ }
+
+ return ML_ERROR_NONE;
+  }
+  return ML_ERROR_INVALID_PARAMETER; /**< unsupported data type */
}
Tensor Tensor::add(Tensor const &m, float const alpha) const {
}
Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
- auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
- float *out_buf) {
- if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 0) {
- std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
- std::plus<float>());
- } else {
- for (unsigned int i = 0; i < e.buffer_size; ++i) {
- *out_buf = *buf + *m_buf * alpha;
- buf += strides[3];
- m_buf += e.strides[3];
- out_buf += strides[3];
- }
- }
- };
-
NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
std::invalid_argument)
<< getName() << " is not contiguous, cannot add";
- apply_broadcast(m, f, output);
-
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
+ float *out_buf) {
+      if (e.strides[3] == 1 && strides[3] == 1 && output.strides[3] == 1 &&
+          alpha == 1.0f) {
+ std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
+ std::plus<float>());
+ } else {
+ for (unsigned int i = 0; i < e.buffer_size; ++i) {
+ *out_buf = *buf + *m_buf * alpha;
+ buf += strides[3];
+ m_buf += e.strides[3];
+          out_buf += output.strides[3];
+ }
+ }
+ };
+ apply_broadcast(m, f, output);
+ } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
+ __fp16 *out_buf) {
+      if (e.strides[3] == 1 && strides[3] == 1 && output.strides[3] == 1 &&
+          alpha == 1.0f) {
+ std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
+ std::plus<__fp16>());
+ } else {
+ for (unsigned int i = 0; i < e.buffer_size; ++i) {
+ *out_buf = *buf + *m_buf * alpha;
+ buf += strides[3];
+ m_buf += e.strides[3];
+          out_buf += output.strides[3];
+ }
+ }
+ };
+ apply_broadcast(m, f, output);
+ }
return output;
}
Tensor &Tensor::subtract(float const &value, Tensor &out) const {
/// @todo add unittest
- auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
- return apply(f, out);
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
+ return apply(f, out);
+ } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto f = std::bind(std::minus<__fp16>(), std::placeholders::_1, value);
+ return apply(f, out);
+  }
+  return out;
}
int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
}
Tensor &Tensor::pow(float exponent, Tensor &out) const {
- auto f = [exponent](float in) { return powf(in, exponent); };
- return apply(f, out);
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto f = [exponent](float in) { return powf(in, exponent); };
+ return apply(f, out);
+ }
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
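+    /// powf computes in float; the result narrows back to __fp16 on return.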
+    auto f = [exponent](__fp16 in) {
+      return static_cast<__fp16>(powf(in, exponent));
+    };
+ return apply(f, out);
+  }
+  return out;
}
Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
bool is_format_nchw = (dim.getFormat() == Tformat::NCHW);
- auto iter_value = [this, is_format_nchw](
- std::array<size_t, 4> &loc,
- const std::array<size_t, 4> &end_loc,
- const std::array<size_t, 4> &reset_dim_arr) -> float & {
- auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
- : getValue(loc[0], loc[3], loc[1], loc[2]);
- for (int i = 3; i >= 0; --i) {
- loc[i]++;
- if (loc[i] == end_loc[i]) {
- loc[i] -= reset_dim_arr[i];
- continue;
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto iter_value = [this, is_format_nchw](
+ std::array<size_t, 4> &loc,
+ const std::array<size_t, 4> &end_loc,
+ const std::array<size_t, 4> &reset_dim_arr) -> float & {
+ auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
+ : getValue(loc[0], loc[3], loc[1], loc[2]);
+ for (int i = 3; i >= 0; --i) {
+ loc[i]++;
+ if (loc[i] == end_loc[i]) {
+ loc[i] -= reset_dim_arr[i];
+ continue;
+ }
+ break;
}
- break;
- }
- return value;
- };
+ return value;
+ };
- std::vector<Tensor> ret;
- ret.reserve(num_size);
+ std::vector<Tensor> ret;
+ ret.reserve(num_size);
- unsigned int accumulated_size = 0;
- for (unsigned int i = 0; i < num_size; ++i) {
- std::array<size_t, 4> loc = {0, 0, 0, 0};
+ unsigned int accumulated_size = 0;
+ for (unsigned int i = 0; i < num_size; ++i) {
+ std::array<size_t, 4> loc = {0, 0, 0, 0};
- if (is_format_nchw) {
- loc[axis] += accumulated_size;
- } else {
- if (axis == 0) {
- loc[0] += accumulated_size;
- } else if (axis == 1) {
- loc[3] += accumulated_size;
- } else if (axis == 2 || axis == 3) {
- loc[axis - 1] += accumulated_size;
+ if (is_format_nchw) {
+ loc[axis] += accumulated_size;
+ } else {
+ if (axis == 0) {
+ loc[0] += accumulated_size;
+ } else if (axis == 1) {
+ loc[3] += accumulated_size;
+ } else if (axis == 2 || axis == 3) {
+ loc[axis - 1] += accumulated_size;
+ }
}
- }
- ret.emplace_back(ret_dims[i]);
- auto &ret_t = ret.back();
+ ret.emplace_back(ret_dims[i]);
+ auto &ret_t = ret.back();
- std::array<size_t, 4> end_loc;
+ std::array<size_t, 4> end_loc;
- if (is_format_nchw) {
- end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
- ret_dims[i].height(), ret_dims[i].width()};
- } else {
- end_loc = {ret_dims[i].batch(), ret_dims[i].height(), ret_dims[i].width(),
- ret_dims[i].channel()};
- }
+ if (is_format_nchw) {
+ end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
+ ret_dims[i].height(), ret_dims[i].width()};
+ } else {
+ end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
+ ret_dims[i].width(), ret_dims[i].channel()};
+ }
- accumulated_size += sizes[i];
+ accumulated_size += sizes[i];
- if (is_format_nchw) {
- end_loc[axis] = accumulated_size;
- } else {
- if (axis == 0) {
- end_loc[0] = accumulated_size;
- } else if (axis == 1) {
- end_loc[3] = accumulated_size;
- } else if (axis == 2 || axis == 3) {
- end_loc[axis - 1] = accumulated_size;
+ if (is_format_nchw) {
+ end_loc[axis] = accumulated_size;
+ } else {
+ if (axis == 0) {
+ end_loc[0] = accumulated_size;
+ } else if (axis == 1) {
+ end_loc[3] = accumulated_size;
+ } else if (axis == 2 || axis == 3) {
+ end_loc[axis - 1] = accumulated_size;
+ }
}
- }
- std::array<size_t, 4> reset_dim_arr;
- if (is_format_nchw) {
- reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
- ret_dims[i].height(), ret_dims[i].width()};
- } else {
- reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
- ret_dims[i].width(), ret_dims[i].channel()};
+ std::array<size_t, 4> reset_dim_arr;
+ if (is_format_nchw) {
+ reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
+ ret_dims[i].height(), ret_dims[i].width()};
+ } else {
+ reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
+ ret_dims[i].width(), ret_dims[i].channel()};
+ }
+
+ ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
+ return iter_value(loc, end_loc, reset_dim_arr);
+ });
}
- ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
- return iter_value(loc, end_loc, reset_dim_arr);
- });
+ return ret;
}
+ if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto iter_value = [this, is_format_nchw](
+ std::array<size_t, 4> &loc,
+ const std::array<size_t, 4> &end_loc,
+                        const std::array<size_t, 4> &reset_dim_arr) -> __fp16 & {
+ auto &value = (is_format_nchw)
+ ? getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
+ : getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
+ for (int i = 3; i >= 0; --i) {
+ loc[i]++;
+ if (loc[i] == end_loc[i]) {
+ loc[i] -= reset_dim_arr[i];
+ continue;
+ }
+ break;
+ }
+ return value;
+ };
- return ret;
+ std::vector<Tensor> ret;
+ ret.reserve(num_size);
+
+ unsigned int accumulated_size = 0;
+ for (unsigned int i = 0; i < num_size; ++i) {
+ std::array<size_t, 4> loc = {0, 0, 0, 0};
+
+ if (is_format_nchw) {
+ loc[axis] += accumulated_size;
+ } else {
+ if (axis == 0) {
+ loc[0] += accumulated_size;
+ } else if (axis == 1) {
+ loc[3] += accumulated_size;
+ } else if (axis == 2 || axis == 3) {
+ loc[axis - 1] += accumulated_size;
+ }
+ }
+
+ ret.emplace_back(ret_dims[i]);
+ auto &ret_t = ret.back();
+
+ std::array<size_t, 4> end_loc;
+
+ if (is_format_nchw) {
+ end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
+ ret_dims[i].height(), ret_dims[i].width()};
+ } else {
+ end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
+ ret_dims[i].width(), ret_dims[i].channel()};
+ }
+
+ accumulated_size += sizes[i];
+
+ if (is_format_nchw) {
+ end_loc[axis] = accumulated_size;
+ } else {
+ if (axis == 0) {
+ end_loc[0] = accumulated_size;
+ } else if (axis == 1) {
+ end_loc[3] = accumulated_size;
+ } else if (axis == 2 || axis == 3) {
+ end_loc[axis - 1] = accumulated_size;
+ }
+ }
+
+ std::array<size_t, 4> reset_dim_arr;
+ if (is_format_nchw) {
+ reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
+ ret_dims[i].height(), ret_dims[i].width()};
+ } else {
+ reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
+ ret_dims[i].width(), ret_dims[i].channel()};
+ }
+
+      ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](__fp16 _) {
+ return iter_value(loc, end_loc, reset_dim_arr);
+ });
+ }
+
+ return ret;
+  }
+  throw std::invalid_argument("Error: unsupported data type");
}
Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
[axis](unsigned cur, const Tensor &t) {
return cur += t.getDim().getTensorDim(axis);
});
- auto iter_value =
- [is_format_nchw](std::array<unsigned, 4> &loc,
- const std::array<unsigned, 4> &start_loc, Tensor &t,
- const std::array<unsigned, 4> &ref_dim_arr) -> float & {
-
- auto &value = is_format_nchw
- ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
- : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
-
- for (int i = 3; i >= 0; --i) {
- loc[i]++;
- if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
- loc[i] = start_loc[i];
- continue;
+ if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ auto iter_value =
+ [is_format_nchw](std::array<unsigned, 4> &loc,
+ const std::array<unsigned, 4> &start_loc, Tensor &t,
+ const std::array<unsigned, 4> &ref_dim_arr) -> float & {
+ auto &value = is_format_nchw
+ ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
+ : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
+
+ for (int i = 3; i >= 0; --i) {
+ loc[i]++;
+ if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
+ loc[i] = start_loc[i];
+ continue;
+ }
+ break;
}
- break;
- }
- return value;
- };
-
- auto ret_dim = ref_dim;
- ret_dim.setTensorDim(axis, axis_dim);
-
- auto ret = Tensor(ret_dim);
-
- std::array<unsigned, 4> loc = {0, 0, 0, 0};
- for (auto &t : tensors) {
- std::array<unsigned, 4> start_loc = loc;
- std::array<unsigned, 4> tensor_dim_arr;
- if (is_format_nchw) {
- tensor_dim_arr[0] = t.getDim().getTensorDim(0);
- tensor_dim_arr[1] = t.getDim().getTensorDim(1);
- tensor_dim_arr[2] = t.getDim().getTensorDim(2);
- tensor_dim_arr[3] = t.getDim().getTensorDim(3);
- } else {
- tensor_dim_arr[0] = t.getDim().getTensorDim(0);
- tensor_dim_arr[1] = t.getDim().getTensorDim(2);
- tensor_dim_arr[2] = t.getDim().getTensorDim(3);
- tensor_dim_arr[3] = t.getDim().getTensorDim(1);
- }
+ return value;
+ };
+
+ auto ret_dim = ref_dim;
+ ret_dim.setTensorDim(axis, axis_dim);
+
+ auto ret = Tensor(ret_dim);
- for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
- iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<float>(i);
+ std::array<unsigned, 4> loc = {0, 0, 0, 0};
+ for (auto &t : tensors) {
+ std::array<unsigned, 4> start_loc = loc;
+ std::array<unsigned, 4> tensor_dim_arr;
+ if (is_format_nchw) {
+ tensor_dim_arr[0] = t.getDim().getTensorDim(0);
+ tensor_dim_arr[1] = t.getDim().getTensorDim(1);
+ tensor_dim_arr[2] = t.getDim().getTensorDim(2);
+ tensor_dim_arr[3] = t.getDim().getTensorDim(3);
+ } else {
+ tensor_dim_arr[0] = t.getDim().getTensorDim(0);
+ tensor_dim_arr[1] = t.getDim().getTensorDim(2);
+ tensor_dim_arr[2] = t.getDim().getTensorDim(3);
+ tensor_dim_arr[3] = t.getDim().getTensorDim(1);
+ }
+
+ for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
+ iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<float>(i);
+ }
+
+ if (is_format_nchw) {
+ loc[axis] += t.getDim().getTensorDim(axis);
+ } else {
+ if (axis == 0) {
+ loc[0] += t.getDim().getTensorDim(axis);
+ } else if (axis == 1) {
+ loc[3] += t.getDim().getTensorDim(axis);
+ } else if (axis == 2 || axis == 3) {
+ loc[axis - 1] += t.getDim().getTensorDim(axis);
+ }
+ }
}
- if (is_format_nchw) {
- loc[axis] += t.getDim().getTensorDim(axis);
- } else {
- if (axis == 0) {
- loc[0] += t.getDim().getTensorDim(axis);
- } else if (axis == 1) {
- loc[3] += t.getDim().getTensorDim(axis);
- } else if (axis == 2 || axis == 3) {
- loc[axis - 1] += t.getDim().getTensorDim(axis);
+ return ret;
+ } else if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ auto iter_value =
+ [is_format_nchw](std::array<unsigned, 4> &loc,
+ const std::array<unsigned, 4> &start_loc, Tensor &t,
+                       const std::array<unsigned, 4> &ref_dim_arr) -> __fp16 & {
+ auto &value = is_format_nchw
+ ? t.getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
+ : t.getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
+
+ for (int i = 3; i >= 0; --i) {
+ loc[i]++;
+ if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
+ loc[i] = start_loc[i];
+ continue;
+ }
+ break;
+ }
+ return value;
+ };
+
+ auto ret_dim = ref_dim;
+ ret_dim.setTensorDim(axis, axis_dim);
+
+ auto ret = Tensor(ret_dim);
+
+ std::array<unsigned, 4> loc = {0, 0, 0, 0};
+ for (auto &t : tensors) {
+ std::array<unsigned, 4> start_loc = loc;
+ std::array<unsigned, 4> tensor_dim_arr;
+ if (is_format_nchw) {
+ tensor_dim_arr[0] = t.getDim().getTensorDim(0);
+ tensor_dim_arr[1] = t.getDim().getTensorDim(1);
+ tensor_dim_arr[2] = t.getDim().getTensorDim(2);
+ tensor_dim_arr[3] = t.getDim().getTensorDim(3);
+ } else {
+ tensor_dim_arr[0] = t.getDim().getTensorDim(0);
+ tensor_dim_arr[1] = t.getDim().getTensorDim(2);
+ tensor_dim_arr[2] = t.getDim().getTensorDim(3);
+ tensor_dim_arr[3] = t.getDim().getTensorDim(1);
+ }
+
+ for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
+        iter_value(loc, start_loc, ret, tensor_dim_arr) =
+          t.getValue<__fp16>(i);
+ }
+
+ if (is_format_nchw) {
+ loc[axis] += t.getDim().getTensorDim(axis);
+ } else {
+ if (axis == 0) {
+ loc[0] += t.getDim().getTensorDim(axis);
+ } else if (axis == 1) {
+ loc[3] += t.getDim().getTensorDim(axis);
+ } else if (axis == 2 || axis == 3) {
+ loc[axis - 1] += t.getDim().getTensorDim(axis);
+ }
}
}
- }
- return ret;
+ return ret;
+  }
+  throw std::invalid_argument("Error: unsupported data type");
}
void Tensor::makeSharedDataTensor(const Tensor &src, size_t offset) {
size_t feat_len = dim.getFeatureLen();
size_t batch = dim.batch();
- const float *data = getData();
- float *rdata = ret.getData();
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData();
+ float *rdata = ret.getData();
+
+ Tensor ones(1, 1, 1, feat_len, this->getFormat());
+ ones.setValue(1.0);
+ sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
+ ones.getData(), 1, 0.0, rdata, 1);
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
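+    /// assumes the blas_interface wrapper provides an __fp16 sgemv overload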
+    const __fp16 *data = getData<__fp16>();
+    __fp16 *rdata = ret.getData<__fp16>();
- Tensor ones(1, 1, 1, feat_len, this->getFormat());
- ones.setValue(1.0);
- sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
- ones.getData(), 1, 0.0, rdata, 1);
+ Tensor ones(1, 1, 1, feat_len, this->getTensorType());
+ ones.setValue((__fp16)1.0);
+ sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
+ ones.getData<__fp16>(), 1, 0.0, rdata, 1);
+ }
return ret;
}
Tensor ret("", this->getFormat());
return sum(axis, ret, alpha, 0);
}
+
Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
float beta) const {
- const float *data = getData<float>();
- NNTR_THROW_IF(!contiguous, std::invalid_argument)
- << getName() << " is not contiguous, cannot sum";
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData<float>();
- if (axis >= 4)
- throw std::out_of_range("Error: axis is invalid");
+ NNTR_THROW_IF(!contiguous, std::invalid_argument)
+ << getName() << " is not contiguous, cannot sum";
- if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
- CREATE_IF_EMPTY_DIMS(ret, dim);
- ret.copy(this->getData());
- return ret;
- }
+ if (axis >= 4)
+ throw std::out_of_range("Error: axis is invalid");
- switch (axis) {
- case 0: {
- CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
- this->getTensorType());
- size_t feat_len = dim.getFeatureLen();
- size_t batch = dim.batch();
- Tensor ones(1, 1, 1, batch, this->getFormat());
- ones.setValue(alpha);
- sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
- ones.getData(), 1, beta, ret.getData(), 1);
- } break;
- case 1: {
- CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
- if (this->getFormat() == Tformat::NHWC) {
- unsigned int m = ret.dim.getDataLen();
- unsigned int n = dim[1];
- Tensor ones(1, 1, 1, n, this->getTensorType());
- ones.setValue(alpha);
- sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
- ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
- } else {
- unsigned int feat_len = dim[2] * dim[3];
- unsigned int t_axis = dim[1];
- Tensor ones(1, 1, 1, t_axis, getTensorType());
- ones.setValue(alpha);
- float *rdata = ret.getData<float>();
- for (unsigned int k = 0; k < dim[0]; ++k) {
- sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
- &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(), 1, beta,
- &rdata[k * feat_len], 1);
- }
+ if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
+ CREATE_IF_EMPTY_DIMS(ret, dim);
+ ret.copy(this->getData());
+ return ret;
}
- } break;
- case 2: {
- CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
-
- if (this->getFormat() == Tformat::NHWC) {
- unsigned int feat_len = dim[1] * dim[3];
- unsigned int t_axis = dim[2];
- Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+
+ switch (axis) {
+ case 0: {
+ CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
+ this->getTensorType());
+ size_t feat_len = dim.getFeatureLen();
+ size_t batch = dim.batch();
+ Tensor ones(1, 1, 1, batch, this->getFormat());
ones.setValue(alpha);
- float *rdata = ret.getData<float>();
- for (unsigned int k = 0; k < dim[0]; ++k) {
- sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
- &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
- 1, beta, &rdata[k * feat_len], 1);
+ sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
+ ones.getData(), 1, beta, ret.getData(), 1);
+ } break;
+ case 1: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int m = ret.dim.getDataLen();
+ unsigned int n = dim[1];
+ Tensor ones(1, 1, 1, n, this->getTensorType());
+ ones.setValue(alpha);
+ sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
+ ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
+ } else {
+ unsigned int feat_len = dim[2] * dim[3];
+ unsigned int t_axis = dim[1];
+ Tensor ones(1, 1, 1, t_axis, getTensorType());
+ ones.setValue(alpha);
+ float *rdata = ret.getData<float>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
+ &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
+ 1, beta, &rdata[k * feat_len], 1);
+ }
}
- } else {
- unsigned int t_3 = dim[3];
- unsigned int t_axis = dim[2];
- Tensor ones(1, 1, 1, t_axis, this->getTensorType());
- ones.setValue(alpha);
- float *rdata = ret.getData<float>();
- for (unsigned int k = 0; k < dim[0]; ++k) {
- for (unsigned int c = 0; c < dim[1]; ++c) {
- unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
- unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
- sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
- ones.getData<float>(), 1, beta, &rdata[ridx], 1);
+ } break;
+ case 2: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
+
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int feat_len = dim[1] * dim[3];
+ unsigned int t_axis = dim[2];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ float *rdata = ret.getData<float>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
+ &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
+ 1, beta, &rdata[k * feat_len], 1);
+ }
+ } else {
+ unsigned int t_3 = dim[3];
+ unsigned int t_axis = dim[2];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ float *rdata = ret.getData<float>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ for (unsigned int c = 0; c < dim[1]; ++c) {
+ unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
+ unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
+ sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
+ ones.getData<float>(), 1, beta, &rdata[ridx], 1);
+ }
}
}
+ } break;
+ case 3: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1,
+ this->getTensorType());
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int t_3 = dim[1];
+ unsigned int t_axis = dim[3];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ float *rdata = ret.getData<float>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ for (unsigned int c = 0; c < dim[2]; ++c) {
+ unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
+ unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
+ sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
+ ones.getData<float>(), 1, beta, &rdata[ridx], 1);
+ }
+ }
+ } else {
+ unsigned int m = ret.dim.getDataLen();
+ unsigned int n = dim[3];
+ Tensor ones(1, 1, 1, n);
+ ones.setValue(alpha);
+ sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, ones.getData(), 1,
+ beta, ret.getData(), 1);
+ }
+ } break;
+ default:
+ throw std::out_of_range("Error: Dimension cannot exceed 3");
+ }
+ return ret;
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const __fp16 *data = getData<__fp16>();
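+    /// the FP16 path mirrors the FP32 logic above; __fp16 overloads of
+    /// copy()/sgemv() are assumed to exist in blas_interface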
+
+ NNTR_THROW_IF(!contiguous, std::invalid_argument)
+ << getName() << " is not contiguous, cannot sum";
+
+ if (axis >= 4)
+ throw std::out_of_range("Error: axis is invalid");
+
+ if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
+ CREATE_IF_EMPTY_DIMS(ret, dim);
+ ret.copy(this->getData<__fp16>());
+ return ret;
}
- } break;
- case 3: {
- CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, this->getTensorType());
- if (this->getFormat() == Tformat::NHWC) {
- unsigned int t_3 = dim[1];
- unsigned int t_axis = dim[3];
- Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+
+ switch (axis) {
+ case 0: {
+ CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
+ this->getTensorType());
+ size_t feat_len = dim.getFeatureLen();
+ size_t batch = dim.batch();
+      Tensor ones(1, 1, 1, batch, this->getTensorType());
ones.setValue(alpha);
- float *rdata = ret.getData<float>();
- for (unsigned int k = 0; k < dim[0]; ++k) {
- for (unsigned int c = 0; c < dim[2]; ++c) {
- unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
- unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
- sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
- ones.getData<float>(), 1, beta, &rdata[ridx], 1);
+ sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
+ ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
+ } break;
+ case 1: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int m = ret.dim.getDataLen();
+ unsigned int n = dim[1];
+ Tensor ones(1, 1, 1, n, this->getTensorType());
+ ones.setValue(alpha);
+ sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
+ ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
+ } else {
+ unsigned int feat_len = dim[2] * dim[3];
+ unsigned int t_axis = dim[1];
+ Tensor ones(1, 1, 1, t_axis, getTensorType());
+ ones.setValue(alpha);
+ __fp16 *rdata = ret.getData<__fp16>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
+ &data[k * dim.getFeatureLen()], feat_len,
+ ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
}
}
- } else {
- unsigned int m = ret.dim.getDataLen();
- unsigned int n = dim[3];
- Tensor ones(1, 1, 1, n);
- ones.setValue(alpha);
- sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, ones.getData(), 1,
- beta, ret.getData(), 1);
+ } break;
+ case 2: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
+
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int feat_len = dim[1] * dim[3];
+ unsigned int t_axis = dim[2];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ __fp16 *rdata = ret.getData<__fp16>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
+ &data[k * dim.getFeatureLen()], feat_len,
+ ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
+ }
+ } else {
+ unsigned int t_3 = dim[3];
+ unsigned int t_axis = dim[2];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ __fp16 *rdata = ret.getData<__fp16>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ for (unsigned int c = 0; c < dim[1]; ++c) {
+ unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
+ unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
+ sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
+ ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
+ }
+ }
+ }
+ } break;
+ case 3: {
+ CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1,
+ this->getTensorType());
+ if (this->getFormat() == Tformat::NHWC) {
+ unsigned int t_3 = dim[1];
+ unsigned int t_axis = dim[3];
+ Tensor ones(1, 1, 1, t_axis, this->getTensorType());
+ ones.setValue(alpha);
+ __fp16 *rdata = ret.getData<__fp16>();
+ for (unsigned int k = 0; k < dim[0]; ++k) {
+ for (unsigned int c = 0; c < dim[2]; ++c) {
+ unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
+ unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
+ sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
+ ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
+ }
+ }
+ } else {
+ unsigned int m = ret.dim.getDataLen();
+ unsigned int n = dim[3];
+        Tensor ones(1, 1, 1, n, this->getTensorType());
+ ones.setValue(alpha);
+ sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
+ ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
+ }
+ } break;
+ default:
+ throw std::out_of_range("Error: Dimension cannot exceed 3");
}
- } break;
- default:
- throw std::out_of_range("Error: Dimension cannot exceed 3");
+ return ret;
}
- return ret;
}
Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
getTensorType());
}
- // We are not set zero the result because of performnace reason.
+ // We do not zero-initialize the result, for performance reasons.
// However, result is not initialized properly. There might include
// garbage like nan. When we have to use this value as in C = alpha*A*B +
- // beta*C, then have to check gargabe data of C is not effect or not.
+ // beta*C, we have to check that the garbage data in C has no effect.
} else if (!trans && trans_m) {
if (dim2 != mdim2)
N = mdim1;
M = dim2;
if (getFormat() == Tformat::NHWC) {
- CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
+ CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
} else {
CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
}
ldb = mdim2;
ldc = (getFormat() == Tformat::NHWC) ? result.channel() : result.width();
- const float *data = getData();
- const float *mdata = m.getData();
- float *rdata = result.getData();
- const float alpha = 1.0f;
- enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
- enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
-
- /// shortcut handling in case of vector
- /// for vector, (1 * K) == (K * 1) in current memory layout...
- /// and plaese note that N, K, M is a fixed place holder after considering
- /// transpose.
- /// For example, there is no case like (1 * K) X (1 * K) while
- /// (1 * K) X (1 * M) can be a case
- /// case1: (1 * K) X (K * 1)
- if (M == 1 && N == 1) {
- *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
- }
- /// case2: (M * K) X (K * 1)
- else if (N == 1) {
- sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
- rdata, 1);
- }
- /// case3: (1 * K) X (K * N) = 1 * N = R
- /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
- /// Effectively a translation of sgemv
- else if (M == 1) {
- transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
- sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, beta,
- rdata, 1);
- }
- /// case others: use gemm
- else {
- sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, ldb,
- beta, rdata, ldc);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData();
+ const float *mdata = m.getData();
+ float *rdata = result.getData();
+ const float alpha = 1.0f;
+ enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
+ enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
+
+ /// shortcut handling in case of vector
+ /// for vector, (1 * K) == (K * 1) in current memory layout...
+    /// and please note that N, K, M are fixed placeholders after considering
+ /// transpose.
+ /// For example, there is no case like (1 * K) X (1 * K) while
+ /// (1 * K) X (1 * M) can be a case
+ /// case1: (1 * K) X (K * 1)
+ if (M == 1 && N == 1) {
+ *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
+ }
+ /// case2: (M * K) X (K * 1)
+ else if (N == 1) {
+ sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
+ rdata, 1);
+ }
+ /// case3: (1 * K) X (K * N) = 1 * N = R
+ /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
+ /// Effectively a translation of sgemv
+ else if (M == 1) {
+ transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
+ sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
+ beta, rdata, 1);
+ }
+ /// case others: use gemm
+ else {
+ sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
+ ldb, beta, rdata, ldc);
+ }
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
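+    /// assumes __fp16 overloads of sdot/sgemv/sgemm in blas_interface;
+    /// beta remains a float, so mixed expressions promote to float and
+    /// narrow back to __fp16 on store.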
+ const __fp16 *data = getData<__fp16>();
+ const __fp16 *mdata = m.getData<__fp16>();
+ __fp16 *rdata = result.getData<__fp16>();
+ const __fp16 alpha = 1.0f;
+ enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
+ enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
+
+ /// shortcut handling in case of vector
+ /// for vector, (1 * K) == (K * 1) in current memory layout...
+    /// and please note that N, K, M are fixed placeholders after considering
+ /// transpose.
+ /// For example, there is no case like (1 * K) X (1 * K) while
+ /// (1 * K) X (1 * M) can be a case
+ /// case1: (1 * K) X (K * 1)
+ if (M == 1 && N == 1) {
+ *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
+ }
+ /// case2: (M * K) X (K * 1)
+ else if (N == 1) {
+ sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
+ rdata, 1);
+ }
+ /// case3: (1 * K) X (K * N) = 1 * N = R
+ /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
+ /// Effectively a translation of sgemv
+ else if (M == 1) {
+ transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
+ sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
+ beta, rdata, 1);
+ }
+ /// case others: use gemm
+ else {
+ sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
+ ldb, beta, rdata, ldc);
+ }
}
return result;
}
unsigned int SL, SI, SJ, SK;
- const float *inptr;
- float *outptr;
out.reshape(dim.transpose(direction));
bool is_format_nchw = (getFormat() == Tformat::NCHW);
- inptr = getData();
- outptr = out.getData();
-
- switch (indexI) {
- case 0:
- if (indexJ == 1) {
- if (is_format_nchw) {
- transposeloop(l, i, j, k, SL, SI, SJ, SK);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *inptr = getData();
+ float *outptr = out.getData();
+ switch (indexI) {
+ case 0:
+ if (indexJ == 1) {
+ if (is_format_nchw) {
+ transposeloop(l, i, j, k, SL, SI, SJ, SK);
+ } else {
+ transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
+ }
} else {
- transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
+ if (is_format_nchw) {
+ transposeloop(l, i, k, j, SL, SI, SK, SJ);
+ } else {
+ transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
+ }
}
- } else {
- if (is_format_nchw) {
- transposeloop(l, i, k, j, SL, SI, SK, SJ);
+ break;
+ case 1:
+ if (indexJ == 0) {
+ if (is_format_nchw) {
+ transposeloop(l, j, i, k, SL, SJ, SI, SK);
+ } else {
+ transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
+ }
} else {
- transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
+ if (is_format_nchw) {
+ transposeloop(l, j, k, i, SL, SJ, SK, SI);
+ } else {
+ transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
+ }
}
- }
- break;
- case 1:
- if (indexJ == 0) {
- if (is_format_nchw) {
- transposeloop(l, j, i, k, SL, SJ, SI, SK);
+ break;
+ case 2:
+ if (indexJ == 0) {
+ if (is_format_nchw) {
+ transposeloop(l, k, i, j, SL, SK, SI, SJ);
+ } else {
+ transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
+ }
} else {
- transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
+ if (is_format_nchw) {
+ transposeloop(l, k, j, i, SL, SK, SJ, SI);
+ } else {
+ transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
+ }
}
- } else {
- if (is_format_nchw) {
- transposeloop(l, j, k, i, SL, SJ, SK, SI);
+ break;
+ }
+ } else {
+ const __fp16 *inptr = getData<__fp16>();
+ __fp16 *outptr = out.getData<__fp16>();
+ switch (indexI) {
+ case 0:
+ if (indexJ == 1) {
+ if (is_format_nchw) {
+ transposeloop(l, i, j, k, SL, SI, SJ, SK);
+ } else {
+ transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
+ }
} else {
- transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
+ if (is_format_nchw) {
+ transposeloop(l, i, k, j, SL, SI, SK, SJ);
+ } else {
+ transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
+ }
}
- }
- break;
- case 2:
- if (indexJ == 0) {
- if (is_format_nchw) {
- transposeloop(l, k, i, j, SL, SK, SI, SJ);
+ break;
+ case 1:
+ if (indexJ == 0) {
+ if (is_format_nchw) {
+ transposeloop(l, j, i, k, SL, SJ, SI, SK);
+ } else {
+ transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
+ }
} else {
- transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
+ if (is_format_nchw) {
+ transposeloop(l, j, k, i, SL, SJ, SK, SI);
+ } else {
+ transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
+ }
}
- } else {
- if (is_format_nchw) {
- transposeloop(l, k, j, i, SL, SK, SJ, SI);
+ break;
+ case 2:
+ if (indexJ == 0) {
+ if (is_format_nchw) {
+ transposeloop(l, k, i, j, SL, SK, SI, SJ);
+ } else {
+ transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
+ }
} else {
- transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
+ if (is_format_nchw) {
+ transposeloop(l, k, j, i, SL, SK, SJ, SI);
+ } else {
+ transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
+ }
}
+ break;
}
- break;
}
return out;
void Tensor::dropout_mask(float dropout) {
setRandUniform(0.0, 1.0);
- float scale = 1.0 / (1 - dropout);
- float *data_ = getData();
-
- for (unsigned int i = 0; i < size(); ++i) {
- if (data_[i] >= dropout)
- data_[i] = scale;
- else
- data_[i] = 0.0;
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ float scale = 1.0 / (1 - dropout);
+ float *data_ = getData();
+ for (unsigned int i = 0; i < size(); ++i) {
+ if (data_[i] >= dropout)
+ data_[i] = scale;
+ else
+ data_[i] = 0.0;
+ }
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+    __fp16 scale = static_cast<__fp16>(1.0f / (1.0f - dropout));
+ __fp16 *data_ = getData<__fp16>();
+ for (unsigned int i = 0; i < size(); ++i) {
+ if (data_[i] >= dropout)
+ data_[i] = scale;
+ else
+ data_[i] = 0.0;
+ }
}
}
setValue(fill_mask_val);
if (mask_len.batch() != batch())
throw std::invalid_argument("Number of filter masks mismatched");
-
- for (unsigned int b = 0; b < batch(); b++) {
- float *addr = getAddress(b, 0, 0, 0);
- const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
- std::fill(addr, addr + (*mask_len_val), en_mask_val);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ for (unsigned int b = 0; b < batch(); b++) {
+ float *addr = getAddress(b, 0, 0, 0);
+ const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
+ std::fill(addr, addr + (*mask_len_val), en_mask_val);
+ }
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ for (unsigned int b = 0; b < batch(); b++) {
+ __fp16 *addr = getAddress<__fp16>(b, 0, 0, 0);
+ const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
+ std::fill(addr, addr + (*mask_len_val), (__fp16)en_mask_val);
+ }
}
}
}
opposite.setRandBernoulli(zoneout);
- float *data = getData();
- float *opposite_data = opposite.getData();
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ float *data = getData();
+ float *opposite_data = opposite.getData();
- for (unsigned int i = 0; i < size(); ++i) {
- if (opposite_data[i] > epsilon) {
- data[i] = 0.0f;
- } else {
- data[i] = 1.0f;
+ for (unsigned int i = 0; i < size(); ++i) {
+ if (opposite_data[i] > epsilon) {
+ data[i] = 0.0f;
+ } else {
+ data[i] = 1.0f;
+ }
+ }
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ __fp16 *data = getData<__fp16>();
+ __fp16 *opposite_data = opposite.getData<__fp16>();
+
+ for (unsigned int i = 0; i < size(); ++i) {
+ if (opposite_data[i] > epsilon) {
+ data[i] = (__fp16)0.0;
+ } else {
+ data[i] = (__fp16)1.0;
+ }
}
}
}
out << "-------" << std::endl;
}
out.copyfmt(init);
- }
+ }
}
}
void Tensor::copy_with_stride(const Tensor &from) {
if (dim == from.getDim()) {
- for (unsigned int b = 0; b < batch(); ++b) {
- for (unsigned int c = 0; c < channel(); ++c) {
- for (unsigned int h = 0; h < height(); ++h) {
- for (unsigned int w = 0; w < width(); ++w) {
- setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ for (unsigned int b = 0; b < batch(); ++b) {
+ for (unsigned int c = 0; c < channel(); ++c) {
+ for (unsigned int h = 0; h < height(); ++h) {
+ for (unsigned int w = 0; w < width(); ++w) {
+ setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
+ }
+ }
+ }
+ }
+ } else {
+ for (unsigned int b = 0; b < batch(); ++b) {
+ for (unsigned int c = 0; c < channel(); ++c) {
+ for (unsigned int h = 0; h < height(); ++h) {
+ for (unsigned int w = 0; w < width(); ++w) {
+ setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
+ }
}
}
}
}
} else {
Tensor t = Tensor(from.getDim(), true);
- for (unsigned int b = 0; b < t.batch(); ++b) {
- for (unsigned int c = 0; c < t.channel(); ++c) {
- for (unsigned int h = 0; h < t.height(); ++h) {
- for (unsigned int w = 0; w < t.width(); ++w) {
- t.setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
+ if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ for (unsigned int b = 0; b < t.batch(); ++b) {
+ for (unsigned int c = 0; c < t.channel(); ++c) {
+ for (unsigned int h = 0; h < t.height(); ++h) {
+ for (unsigned int w = 0; w < t.width(); ++w) {
+ t.setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
+ }
+ }
+ }
+ }
+ } else {
+      for (unsigned int b = 0; b < t.batch(); ++b) {
+        for (unsigned int c = 0; c < t.channel(); ++c) {
+          for (unsigned int h = 0; h < t.height(); ++h) {
+            for (unsigned int w = 0; w < t.width(); ++w) {
+              t.setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
+ }
}
}
}
if (size() != from.size())
throw std::invalid_argument("Size of tensor to copy must match");
+
+ if (getDataType() != from.getDataType())
+ throw std::invalid_argument("Data type of tensor to copy must match");
+
copy(from.getData());
}
}
if (strides != from.getStrides()) {
- /// @todo length does not represent buffer size, there should be way to get
- /// the buffer size
+ /// @todo length does not represent buffer size, there should be way to
+ /// get the buffer size
throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
}
return this->average(output);
TensorDim ret_shape(getTensorType());
-
+
for (const auto &idx : axes) {
if (idx >= TensorDim::MAXDIM) {
throw std::out_of_range("axis more then MAXDIM is invalid");
void Tensor::setValue(float val) {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot set value.";
-
- float *data = getData<float>();
- std::fill(data, data + size(), val);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ float *data = getData<float>();
+ std::fill(data, data + size(), val);
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ __fp16 *data = getData<__fp16>();
+ std::fill(data, data + size(), (__fp16)val);
+ }
}
void Tensor::setZero() {
std::vector<unsigned int> Tensor::argmax() const {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot get argmax.";
-
- const float *data = getData();
std::vector<unsigned int> result;
- size_t batch_size = batch();
- size_t feature_len = dim.getFeatureLen();
- result.resize(batch_size);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData();
+ size_t batch_size = batch();
+ size_t feature_len = dim.getFeatureLen();
+
+ result.resize(batch_size);
- for (unsigned int b = 0; b < batch_size; b++) {
- auto max_iter =
- std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
- result[b] = std::distance(data, max_iter) - (b * feature_len);
+ for (unsigned int b = 0; b < batch_size; b++) {
+ auto max_iter =
+ std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
+ result[b] = std::distance(data, max_iter) - (b * feature_len);
+ }
+  } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const __fp16 *data = getData<__fp16>();
+ size_t batch_size = batch();
+ size_t feature_len = dim.getFeatureLen();
+
+ result.resize(batch_size);
+
+ for (unsigned int b = 0; b < batch_size; b++) {
+ auto max_iter =
+ std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
+ result[b] = std::distance(data, max_iter) - (b * feature_len);
+ }
}
return result;
<< getName() << " is not contiguous, cannot get l2norm.";
unsigned int len = size();
- const float *data = getData();
-
- return snrm2(len, data, 1);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData<float>();
+ return snrm2(len, data, 1);
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const __fp16 *data = getData<__fp16>();
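+    /// assumes an __fp16 overload of snrm2 in blas_interface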
+ return snrm2(len, data, 1);
+  }
+  throw std::invalid_argument("Error: unsupported data type");
}
float Tensor::max_abs() const {
<< getName() << " is not contiguous, cannot get max_abs.";
unsigned int len = size();
- const float *data = getData();
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData<float>();
+
+ unsigned int idx = isamax(len, data, 1);
+ return *(data + idx);
+
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const __fp16 *data = getData<__fp16>();
- unsigned int idx = isamax(len, data, 1);
- return *(data + idx);
+ unsigned int idx = isamax(len, data, 1);
+ return *(data + idx);
+  }
+  throw std::invalid_argument("Error: unsupported data type");
}
Tensor &Tensor::normalization(Tensor &output) const {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot do normalization.";
- const float *data = getData();
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = getData();
- auto bounds = std::minmax_element(data, data + size());
- const float min = *bounds.first;
- const float max = *bounds.second;
+ auto bounds = std::minmax_element(data, data + size());
+ const float min = *bounds.first;
+ const float max = *bounds.second;
- if (max == min) {
- Tensor tmp = *this;
- this->subtract_i(tmp);
- } else {
- this->subtract_i(min);
- this->divide_i(max - min);
+ if (max == min) {
+ Tensor tmp = *this;
+ this->subtract_i(tmp);
+ } else {
+ this->subtract_i(min);
+ this->divide_i(max - min);
+ }
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const __fp16 *data = getData<__fp16>();
+
+ auto bounds = std::minmax_element(data, data + size());
+ const __fp16 min = *bounds.first;
+ const __fp16 max = *bounds.second;
+
+ if (max == min) {
+ Tensor tmp = *this;
+ this->subtract_i(tmp);
+ } else {
+ this->subtract_i(min);
+ this->divide_i(max - min);
+ }
}
}
mean_by_batch.divide_i(dim.getFeatureLen());
this->subtract_i(mean_by_batch);
+ if (getDataType() == ml::train::TensorDim::DataType::FP32) {
+ Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
+ std_dev_by_batch.setZero();
+ float *std_dev = std_dev_by_batch.getData();
+
+ for (unsigned int k = 0; k < dim.batch(); ++k) {
+ Tensor sub_this = this->getBatchSlice(k, 1);
+ std_dev[k] = sub_this.l2norm();
+ }
- Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
- std_dev_by_batch.setZero();
- float *std_dev = std_dev_by_batch.getData();
+ std_dev_by_batch.divide_i(dim.getFeatureLen());
+ this->divide_i(std_dev_by_batch);
+ } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
+    Tensor std_dev_by_batch(dim.batch(), 1, 1, 1, getTensorType());
+ std_dev_by_batch.setZero();
+ __fp16 *std_dev = std_dev_by_batch.getData<__fp16>();
- for (unsigned int k = 0; k < dim.batch(); ++k) {
- Tensor sub_this = this->getBatchSlice(k, 1);
- std_dev[k] = sub_this.l2norm();
- }
+ for (unsigned int k = 0; k < dim.batch(); ++k) {
+ Tensor sub_this = this->getBatchSlice(k, 1);
+ std_dev[k] = sub_this.l2norm();
+ }
- std_dev_by_batch.divide_i(dim.getFeatureLen());
- this->divide_i(std_dev_by_batch);
+ std_dev_by_batch.divide_i(dim.getFeatureLen());
+ this->divide_i(std_dev_by_batch);
+ }
}
Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
Tensor Tensor::rotate_180(Tensor in) {
Tensor output(in.getDim());
- output.setZero();
- for (unsigned int i = 0; i < in.batch(); ++i) {
- for (unsigned int j = 0; j < in.channel(); ++j) {
- for (unsigned int k = 0; k < in.height(); ++k) {
- for (unsigned int l = 0; l < in.width(); ++l) {
- output.setValue(i, j, k, l,
- in.getValue<float>(i, j, (in.height() - k - 1),
- (in.width() - l - 1)));
+ if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ output.setZero();
+ for (unsigned int i = 0; i < in.batch(); ++i) {
+ for (unsigned int j = 0; j < in.channel(); ++j) {
+ for (unsigned int k = 0; k < in.height(); ++k) {
+ for (unsigned int l = 0; l < in.width(); ++l) {
+ output.setValue(i, j, k, l,
+ in.getValue<float>(i, j, (in.height() - k - 1),
+ (in.width() - l - 1)));
+ }
}
}
}
+ return output;
+ } else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ output.setZero();
+ for (unsigned int i = 0; i < in.batch(); ++i) {
+ for (unsigned int j = 0; j < in.channel(); ++j) {
+ for (unsigned int k = 0; k < in.height(); ++k) {
+ for (unsigned int l = 0; l < in.width(); ++l) {
+ output.setValue(i, j, k, l,
+ in.getValue<__fp16>(i, j, (in.height() - k - 1),
+ (in.width() - l - 1)));
+ }
+ }
+ }
+ }
+ return output;
}
return output;
}
} /* namespace nntrainer */