2 * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * http://www.apache.org/licenses/LICENSE-2.0
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
16 * @date 04 December 2019
17 * @brief This is the Tensor class for calculation
18 * @see https://github.com/nnstreamer/nntrainer
19 * @author Jijoong Moon <jijoong.moon@samsung.com>
20 * @bug No known bugs except for NYI items
39 #include <blas_interface.h>
40 #include <lazy_tensor.h>
41 #include <nntrainer_error.h>
42 #include <nntrainer_log.h>
44 #include <util_func.h>
46 #define transposeloop(cl, ci, cj, ck, sl, si, sj, sk) \
48 unsigned int i, j, k, l; \
49 int inidx = 0, outidx = 0; \
50 for (cl = 0; cl < sl; cl++) \
51 for (ci = 0; ci < si; ci++) \
52 for (cj = 0; cj < sj; cj++) \
53 for (ck = 0; ck < sk; ck++) { \
54 outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
55 inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k; \
56 outptr[outidx] = inptr[inidx]; \
60 #define CREATE_IF_EMPTY_DIMS(tensor, ...) \
63 tensor = Tensor(__VA_ARGS__); \
68 * @struct BroadcastInfo
69 * @brief External loop info for broadcasted iteration. Please refer to
70 * DISABLED_private_external_loop_n in unittest_nntrainer_tensor.
71 * @note This would better be implemented in an iterator fashion before being used
74 struct Tensor::BroadcastInfo {
77 * @brief Construct a new BroadcastInfo object
80 BroadcastInfo() : buffer_size(0), buffer_axis(-1), strides{0, 0, 0, 0} {}
82 unsigned int buffer_size; /**< virtual size of the buffer */
83 int buffer_axis; /**< the smallest axis that should be looped.
84 -1 means no loop needed*/
85 std::array<unsigned int, TensorDim::MAXDIM>
86 strides; /**< modified strides for the loop */
89 static auto rng = [] {
95 Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init,
98 if (d.getDataLen() != 0) {
100 strides = d.computeStrides();
108 Tensor::Tensor(const TensorDim &d, const float *buf) : Tensor(d, true) {
109 if (d.getDataLen() != 0) {
116 * @class SrcSharedTensor
117 * @brief Source of the shared tensor
119 class SrcSharedTensor {
122 * @brief Constructor for the class
124 SrcSharedTensor() : src(nullptr), off(0) {}
126 SrcSharedTensor(const Tensor *tensor, unsigned int offset) :
131 * @brief Get the allocated src tensor
133 const Tensor *tensor() const {
135 throw std::runtime_error("Accessing empty src tensor");
141 * @brief Get the offset from the source tensor
143 unsigned int offset() const { return off; }
146 const Tensor *src; /**< Tensor of the source */
147 unsigned int off; /**< offset from the source data ptr */
150 void Tensor::allocate() {
152 /// already allocated
156 /// allocate data based on the source tensor
157 data = src_tensor->tensor()->data;
158 offset = src_tensor->tensor()->offset + src_tensor->offset();
159 /** as this memory is shared, do NOT initialize */
161 /// allocate new memory for the tensor data
162 auto mem_data = new MemoryData<float>(new float[dim.getDataLen()]);
163 data = std::shared_ptr<MemoryData<float>>(mem_data, [](auto *mem_data) {
164 delete[] mem_data->getAddr();
172 Tensor Tensor::Map(float *buf, unsigned int bytes, const TensorDim &d,
174 if (d.getDataLen() == 0 || buf == nullptr) {
175 throw std::invalid_argument(
176 "[Tensor::Map] empty tensor dim is not allowed");
179 if (d.getDataLen() * sizeof(float) + offset > bytes) {
180 throw std::invalid_argument(
181 "Creating shared tensor of size bigger than tensor memory.");
186 tmp.strides = d.computeStrides();
187 /// Tensor does not own the memory
188 tmp.data = std::shared_ptr<MemoryData<float>>(
189 new MemoryData<float>(buf), std::default_delete<MemoryData<float>>());
195 bool Tensor::operator==(const Tensor &rhs) const {
196 if (this->dim != rhs.dim)
201 if (len != rhs.size())
204 const float *data = getData();
205 const float *rdata = rhs.getData();
207 if (contiguous != rhs.contiguous)
210 if (strides != rhs.strides)
213 for (size_t i = 0; i < len; ++i) {
214 /** not checking sign change is intentional to avoid float calculation
216 if ((std::isnan(data[i]) && !std::isnan(rdata[i])) ||
217 (!std::isnan(data[i]) && std::isnan(rdata[i])) ||
218 std::fabs(data[i] - rdata[i]) > epsilon)
225 template <typename T> void Tensor::setDist(T dist) {
226 NNTR_THROW_IF(!contiguous, std::invalid_argument)
227 << getName() << " Tensor is not contiguous, cannot set distribution";
229 float *data = getData();
230 unsigned int len = size();
231 for (unsigned int i = 0; i < len; ++i) {
236 void Tensor::setRandNormal(float mean, float std) {
237 setDist<std::normal_distribution<float>>(
238 std::normal_distribution<float>(mean, std));
241 void Tensor::setRandUniform(float min, float max) {
242 setDist<std::uniform_real_distribution<float>>(
243 std::uniform_real_distribution<float>(min, max));
246 void Tensor::setRandBernoulli(float probability) {
247 setDist<std::bernoulli_distribution>(
248 std::bernoulli_distribution(probability));
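/**
 * @note (editorial addition) Usage sketch for the random initializers above,
 * from a caller's point of view; not part of the original implementation. The
 * dimension values are illustrative only and assume the (batch, channel,
 * height, width) constructor declared in tensor.h.
 * @code
 *   nntrainer::Tensor w(1, 1, 4, 8);   // 1x1x4x8 tensor
 *   w.setRandNormal(0.0f, 0.02f);      // values ~ N(0, 0.02)
 *   w.setRandUniform(-0.1f, 0.1f);     // or values ~ U(-0.1, 0.1)
 *   w.setRandBernoulli(0.5f);          // or a Bernoulli(0.5) 0/1 mask
 * @endcode
 */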
251 void Tensor::initialize() {
252 if (empty() || !isAllocated())
255 unsigned int fan_in, fan_out;
257  /// @fixme: when unit is equal to one, this does not work; we need to rely on
258  /// the effective dimension rather than the actual numbers here. For now, some
259  /// heuristics are added to infer what fan_in/fan_out would be.
260 if (dim.batch() * dim.channel() * dim.height() == 1) {
261 fan_out = fan_in = dim.width();
262 } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
263 fan_in = dim.height();
264 fan_out = dim.width();
265 } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
266 auto field_size = dim.height() * dim.width();
268    // this also handles the cases below:
269    // 1. fan_in = fan_out = 1
270    // 2. batch == 1, channel == 1 and height == 1 (theoretical rank of 1)
271 fan_in = dim.channel() * field_size;
272 fan_out = dim.batch() * field_size;
275 switch (initializer) {
276 case Tensor::Initializer::ZEROS:
279 case Tensor::Initializer::ONES:
282 case Tensor::Initializer::LECUN_NORMAL:
283 setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
285 case Tensor::Initializer::XAVIER_NORMAL:
286 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
288 case Tensor::Initializer::HE_NORMAL:
289 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
291 case Tensor::Initializer::LECUN_UNIFORM:
292 setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
294 case Tensor::Initializer::XAVIER_UNIFORM:
295 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
296 sqrtFloat(6.0 / (fan_in + fan_out)));
298 case Tensor::Initializer::HE_UNIFORM:
299 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
300 sqrtFloat(6.0 / (fan_in)));
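/**
 * @note (editorial addition) Worked example of the fan_in/fan_out heuristic
 * above; not part of the original implementation. For a conv2d filter tensor
 * of dimension (batch=32, channel=3, height=3, width=3), field_size = 3 * 3 = 9,
 * fan_in = 3 * 9 = 27 and fan_out = 32 * 9 = 288, so XAVIER_NORMAL draws from
 * N(0, sqrt(2 / (27 + 288))). The snippet assumes an initialize(Initializer)
 * overload is declared in tensor.h.
 * @code
 *   nntrainer::Tensor w(32, 3, 3, 3);
 *   w.initialize(nntrainer::Tensor::Initializer::XAVIER_NORMAL);
 * @endcode
 */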
310 std::vector<std::vector<std::vector<std::vector<float>>>> const &d) {
312 if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
313 throw std::out_of_range(
314 "[Tensor] trying to initialize Tensor from empty vector");
318 dim.channel(d[0].size());
319 dim.height(d[0][0].size());
320 dim.width(d[0][0][0].size());
321 strides = dim.computeStrides();
322 auto mem_data = new MemoryData<float>(new float[dim.getDataLen()]);
323 data = std::shared_ptr<MemoryData<float>>(
324 mem_data, [](auto *mem_data) { delete[] mem_data->getAddr(); });
327 initializer = Initializer::NONE;
329 for (unsigned int i = 0; i < dim.batch(); ++i)
330 for (unsigned int j = 0; j < dim.channel(); ++j)
331 for (unsigned int k = 0; k < dim.height(); ++k)
332 for (unsigned int l = 0; l < dim.width(); ++l)
333 this->setValue(i, j, k, l, d[i][j][k][l]);
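/**
 * @note (editorial addition) Usage sketch for the nested-vector constructor
 * above; not part of the original implementation. The values are illustrative.
 * @code
 *   std::vector<std::vector<std::vector<std::vector<float>>>> in = {
 *     {{{1.0f, 2.0f}, {3.0f, 4.0f}}}};   // 1 batch, 1 channel, 2x2 feature map
 *   nntrainer::Tensor t(in);             // dim becomes 1:1:2:2
 * @endcode
 */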
336 int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
338 this->multiply_strided(m, *this, beta);
339 } catch (std::exception &err) {
340 ml_loge("%s %s", typeid(err).name(), err.what());
341 return ML_ERROR_INVALID_PARAMETER;
344 return ML_ERROR_NONE;
347 Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
349 return this->multiply_strided(m, t, beta);
352 Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
353 const float beta) const {
354   /** TODO: throw rather than create new dimensions */
355 CREATE_IF_EMPTY_DIMS(output, dim);
357 if (size() != m.size() || size() != output.size())
358 throw std::invalid_argument(
359 "Strided multiplication does not support broadcasting");
361 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
363 for (unsigned int b = 0; b < batch(); ++b) {
364 for (unsigned int c = 0; c < channel(); ++c) {
365 for (unsigned int h = 0; h < height(); ++h) {
366 for (unsigned int w = 0; w < width(); ++w) {
368 b, c, h, w, getValue(b, c, h, w) * m.getValue(b, c, h, w), beta);
374 /** @todo optimize this with combining these loops where stride is 1 */
375 for (unsigned int b = 0; b < batch(); ++b) {
376 for (unsigned int c = 0; c < channel(); ++c) {
377 for (unsigned int h = 0; h < height(); ++h) {
378 float *out_data = output.getAddress(b, c, h, 0);
379 const float *m_data = m.getAddress(b, c, h, 0);
380 const float *in_data = getAddress(b, c, h, 0);
381 std::transform(in_data, in_data + width(), m_data, out_data,
382 std::multiplies<float>());
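/**
 * @note (editorial addition) Usage sketch for multiply_strided; not part of
 * the original implementation. The strided variants do not broadcast, so all
 * three tensors must have the same size.
 * @code
 *   nntrainer::Tensor a(2, 1, 2, 2), b(2, 1, 2, 2), out(2, 1, 2, 2);
 *   a.setValue(3.0f);
 *   b.setValue(0.5f);
 *   a.multiply_strided(b, out, 0.0f);   // out = a * b element-wise, i.e. 1.5
 * @endcode
 */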
391 int Tensor::add_i_strided(Tensor const &m, const float beta) {
393 this->add_strided(m, *this, beta);
394 } catch (std::exception &err) {
395 ml_loge("%s %s", typeid(err).name(), err.what());
396 return ML_ERROR_INVALID_PARAMETER;
399 return ML_ERROR_NONE;
402 Tensor Tensor::add_strided(Tensor const &m, const float beta) const {
404 return this->add_strided(m, t, beta);
407 Tensor &Tensor::add_strided(Tensor const &m, Tensor &output,
408 const float beta) const {
409   /** TODO: throw rather than create new dimensions */
410 CREATE_IF_EMPTY_DIMS(output, dim);
412 if (size() != m.size() || size() != output.size())
413 throw std::invalid_argument(
414 "Strided addition does not support broadcasting");
416 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
418 for (unsigned int b = 0; b < batch(); ++b) {
419 for (unsigned int c = 0; c < channel(); ++c) {
420 for (unsigned int h = 0; h < height(); ++h) {
421 for (unsigned int w = 0; w < width(); ++w) {
423 b, c, h, w, getValue(b, c, h, w) + m.getValue(b, c, h, w) * beta);
429 /** @todo optimize this with combining these loops where stride is 1 */
430 for (unsigned int b = 0; b < batch(); ++b) {
431 for (unsigned int c = 0; c < channel(); ++c) {
432 for (unsigned int h = 0; h < height(); ++h) {
433 float *out_data = output.getAddress(b, c, h, 0);
434 const float *m_data = m.getAddress(b, c, h, 0);
435 const float *in_data = getAddress(b, c, h, 0);
436 std::transform(in_data, in_data + width(), m_data, out_data,
446 int Tensor::multiply_i(float const &value) {
447 NNTR_THROW_IF(!contiguous, std::invalid_argument)
448 << getName() << " is not contiguous, cannot multiply";
450 /// @note this is not depending on multiply_i as there is an optimized
451 /// version for multiply_i
452 float *data = getData();
453 unsigned int len = size();
455 sscal(len, value, data, 1);
456 return ML_ERROR_NONE;
459 Tensor Tensor::multiply(float const &value) const {
461 return multiply(value, t);
464 Tensor &Tensor::multiply(float const &value, Tensor &out) const {
465 /// @todo add unittest
466 auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
467 return apply(f, out);
470 int Tensor::multiply_i(Tensor const &m, const float beta) {
472 this->multiply(m, *this, beta);
473 } catch (std::exception &err) {
474 ml_loge("%s %s", typeid(err).name(), err.what());
475 return ML_ERROR_INVALID_PARAMETER;
478 return ML_ERROR_NONE;
481 Tensor Tensor::multiply(Tensor const &m, const float beta) const {
483 return this->multiply(m, t, beta);
486 Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
487 const float beta) const {
489   * @note this does not work correctly with differently strided inputs.
490   * Use multiply_strided instead.
492 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
494 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
496 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
497 std::multiplies<float>());
499 for (unsigned int i = 0; i < e.buffer_size; ++i) {
500 *out_buf = *buf * *m_buf + beta * *out_buf;
502 m_buf += e.strides[3];
503 out_buf += output.strides[3];
508 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
509 std::invalid_argument)
510 << getName() << " is not contiguous, cannot multiply";
512 apply_broadcast(m, f, output);
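/**
 * @note (editorial addition) Usage sketch for broadcasted multiply; not part
 * of the original implementation. Here every non-matching axis of the operand
 * has dimension 1, so its per-batch value is broadcast over the feature axes.
 * Assumes the default beta declared in tensor.h.
 * @code
 *   nntrainer::Tensor x(2, 1, 2, 3);
 *   nntrainer::Tensor scale(2, 1, 1, 1);
 *   x.setValue(2.0f);
 *   scale.setValue(0.5f);
 *   nntrainer::Tensor y = x.multiply(scale);   // every element becomes 1.0
 * @endcode
 */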
516 int Tensor::divide_i(float const &value) {
518 return ML_ERROR_INVALID_PARAMETER;
520 this->divide(value, *this);
521 return ML_ERROR_NONE;
524 Tensor Tensor::divide(float const &value) const {
526 return divide(value, t);
529 Tensor &Tensor::divide(float const &value, Tensor &out) const {
530 auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
531 /// @todo add unittest
533 std::stringstream ss;
534 ss << "[Tensor] divide by value failed, value: " << value;
535 throw std::invalid_argument(ss.str().c_str());
537 return apply(f, out);
540 int Tensor::divide_i(Tensor const &m) {
542 this->divide(m, *this);
543 } catch (std::exception &err) {
544 ml_loge("%s %s", typeid(err).name(), err.what());
545 return ML_ERROR_INVALID_PARAMETER;
548 return ML_ERROR_NONE;
551 Tensor Tensor::divide(Tensor const &m) const {
553 return this->divide(m, t);
556 Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
557 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
559 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
560 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
561 std::divides<float>());
563 for (unsigned int i = 0; i < e.buffer_size; ++i) {
564 *out_buf = *buf / *m_buf;
566 m_buf += e.strides[3];
567 out_buf += output.strides[3];
572 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
573 std::invalid_argument)
574 << getName() << " is not contiguous, cannot divide";
576 apply_broadcast(m, f, output);
580 int Tensor::add_i(float const &value) {
581 this->add(value, *this);
582 return ML_ERROR_NONE;
585 Tensor Tensor::add(float const &value) const {
587 return add(value, t);
590 Tensor &Tensor::add(float const &value, Tensor &out) const {
591 /// @todo add unittest
592 auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
593 return apply(f, out);
596 int Tensor::add_i(Tensor const &m, float const alpha) {
597   /// @todo: add an axis parameter rather than always adding over the last two
598   /// dimensions; the _i operator has an optimized version
599 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
601 saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
604 /// @todo: enable this after add_strided supports broadcast
605 // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
606 // << getName() << " is not contiguous, cannot add";
609 apply_broadcast(m, f, *this);
610 } catch (std::exception &err) {
611 ml_loge("%s %s", typeid(err).name(), err.what());
612 return ML_ERROR_INVALID_PARAMETER;
615 return ML_ERROR_NONE;
618 Tensor Tensor::add(Tensor const &m, float const alpha) const {
620 return this->add(m, t, alpha);
623 Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
624 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
626     if (e.strides[3] == 1 && strides[3] == 1 && output.strides[3] == 1 && alpha == 0) {
627 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
630 for (unsigned int i = 0; i < e.buffer_size; ++i) {
631 *out_buf = *buf + *m_buf * alpha;
633 m_buf += e.strides[3];
634 out_buf += strides[3];
639 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
640 std::invalid_argument)
641 << getName() << " is not contiguous, cannot add";
643 apply_broadcast(m, f, output);
648 int Tensor::subtract_i(float const &value) {
649 this->subtract(value, *this);
650 return ML_ERROR_NONE;
653 Tensor Tensor::subtract(float const &value) const {
655 return subtract(value, t);
658 Tensor &Tensor::subtract(float const &value, Tensor &out) const {
659 /// @todo add unittest
660 auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
661 return apply(f, out);
664 int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
666 Tensor Tensor::subtract(Tensor const &m) const { return add(m, -1); }
668 Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const {
669 return add(m, out, -1);
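/**
 * @note (editorial addition) Usage sketch for scaled addition and subtraction;
 * not part of the original implementation. subtract is implemented above as
 * add with alpha = -1.
 * @code
 *   nntrainer::Tensor a(1, 1, 2, 2), b(1, 1, 2, 2);
 *   a.setValue(1.0f);
 *   b.setValue(4.0f);
 *   a.add_i(b, 0.25f);                     // a += 0.25 * b -> every element 2.0
 *   nntrainer::Tensor c = a.subtract(b);   // c = a - b     -> every element -2.0
 * @endcode
 */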
672 int Tensor::pow_i(float exponent) {
673 pow(exponent, *this);
674 return ML_ERROR_NONE;
677 Tensor Tensor::pow(float exponent) const {
679 return pow(exponent, t);
682 Tensor &Tensor::pow(float exponent, Tensor &out) const {
683 auto f = [exponent](float in) { return powf(in, exponent); };
684 return apply(f, out);
687 Tensor Tensor::getBatchSlice(unsigned int offset, unsigned int size) const {
688 TensorDim dim_ = dim;
691 return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen());
694 void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
695 unsigned int offset) {
697  * - If src already has data allocated, then directly make dest tensor based on
699  * - If src.data does not exist (meaning the tensor does not have memory allocated),
700  * and src.src_tensor does not exist (meaning the src tensor does not depend
701 * on another tensor), then create a SrcSharedTensor around the src.
702 * - If src.src_tensor exists, then use the src.src_tensor to create the
703 * required SrcSharedTensor to avoid recursive dependency.
705 * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
706 * if the batch size of src is updated and needs reallocation.
710 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
712 } else if (!src.src_tensor)
713 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
715 dest.src_tensor = std::make_shared<SrcSharedTensor>(
716 src.src_tensor->tensor(), offset + src.src_tensor->offset());
719 Tensor Tensor::getSharedDataTensor(const TensorDim dim_, unsigned int offset,
721 const std::string &name_) const {
727 if (dim_.getDataLen() + offset > dim.getDataLen())
728 throw std::invalid_argument(
729 "Creating shared tensor of size bigger than tensor memory.");
732 ret.strides = ret.dim.computeStrides();
734 TensorDim new_match_dim = dim_;
735 new_match_dim.batch(dim.batch());
736 if (new_match_dim != dim && !reset_stride)
737 ret.contiguous = false;
740  * In this case, it's the caller's responsibility to ensure that allocate() is
741 * called for the output tensor before operating on the output tensor.
743 createSharedDataTensor(*this, ret, offset);
748 std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
750 NNTR_THROW_IF(num_size == 0, std::invalid_argument)
751 << "num size cannot be zero";
757 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
758     << "cannot split along axis: " << axis;
760 NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument)
761 << "axis is not divisible by num_size, axis: " << axis
762 << " num size: " << num_size;
765 auto new_dim = dim.getTensorDim(axis) / num_size;
766 ret_dim.setTensorDim(axis, new_dim);
768 auto iter_value = [this, &ret_dim](std::array<unsigned, 4> &loc) -> float & {
769 auto &value = getValue(loc[0], loc[1], loc[2], loc[3]);
770 for (int i = 3; i >= 0; --i) {
772 if (loc[i] % ret_dim.getTensorDim(i) == 0) {
773 loc[i] -= ret_dim.getTensorDim(i);
781 std::vector<Tensor> ret;
782 ret.reserve(num_size);
784 for (unsigned int i = 0; i < num_size; ++i) {
785 std::array<unsigned, 4> loc = {0, 0, 0, 0};
786 loc[axis] = new_dim * i;
787 ret.emplace_back(ret_dim);
788 auto &ret_t = ret.back();
790 ret_t.apply_i([&iter_value, &loc](float _) { return iter_value(loc); });
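/**
 * @note (editorial addition) Usage sketch for split; not part of the original
 * implementation. The dimension along the chosen axis must be divisible by
 * num_size.
 * @code
 *   nntrainer::Tensor t(1, 1, 2, 6);
 *   t.setValue(1.0f);
 *   std::vector<nntrainer::Tensor> parts = t.split(3, 3);   // three 1:1:2:2 tensors
 * @endcode
 */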
796 Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
802 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
803     << "cannot concatenate along axis: " << axis;
805 NNTR_THROW_IF(tensors.empty(), std::invalid_argument)
806 << "given tensor vector is empty";
808 auto ref_dim = tensors.front().getDim();
809 ref_dim.setTensorDim(axis, 1);
810 NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
811 [&ref_dim, axis](const Tensor &t) {
812 auto cur_dim = t.getDim();
813 cur_dim.setTensorDim(axis, 1);
814 return ref_dim == cur_dim;
816 std::invalid_argument)
817     << " all tensors must have the same dimension except for the axis, ref_dim: "
818 << ref_dim << " axis : " << axis;
820 auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
821 [axis](unsigned cur, const Tensor &t) {
822 return cur += t.getDim().getTensorDim(axis);
824 auto iter_value = [](std::array<unsigned, 4> &loc,
825 std::array<unsigned, 4> &start_loc, Tensor &t,
826 const TensorDim &ref_dim) -> float & {
827 auto &value = t.getValue(loc[0], loc[1], loc[2], loc[3]);
828 for (int i = 3; i >= 0; --i) {
830 if (loc[i] - start_loc[i] == ref_dim.getTensorDim(i)) {
831 loc[i] = start_loc[i];
839 auto ret_dim = ref_dim;
840 ret_dim.setTensorDim(axis, axis_dim);
842 auto ret = Tensor(ret_dim);
844 std::array<unsigned, 4> loc = {0, 0, 0, 0};
845 for (auto &t : tensors) {
846 std::array<unsigned, 4> start_loc = loc;
847 for (auto i = 0u, sz = t.size(); i < sz; ++i) {
848 iter_value(loc, start_loc, ret, t.getDim()) = t.getValue(i);
850 loc[axis] += t.getDim().getTensorDim(axis);
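/**
 * @note (editorial addition) Usage sketch for cat; not part of the original
 * implementation. All dimensions except the concatenation axis must match.
 * @code
 *   nntrainer::Tensor a(1, 1, 2, 2), b(1, 1, 2, 2);
 *   a.setValue(1.0f);
 *   b.setValue(2.0f);
 *   nntrainer::Tensor c = nntrainer::Tensor::cat({a, b}, 3);   // dim becomes 1:1:2:4
 * @endcode
 */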
856 void Tensor::makeSharedDataTensor(const Tensor &src, unsigned int offset) {
857 if (strides != src.strides)
858 throw std::invalid_argument(
859 "Creating shared tensor of different stride than source tensor.");
861 if (getDim().getDataLen() + offset > src.getDim().getDataLen())
862 throw std::invalid_argument(
863 "Creating shared tensor of different size or stride than source tensor.");
866  * In this case, it's the caller's responsibility to ensure that allocate() is
867 * called for the output tensor before operating on the output tensor.
869 createSharedDataTensor(src, *this, offset);
872 void Tensor::apply_broadcast(
874 std::function<void(const BroadcastInfo &e, const float *, const float *,
877 Tensor &output) const {
878 CREATE_IF_EMPTY_DIMS(output, dim);
880 NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
881 << getName() << " is not allocated";
882 NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument)
883 << m.getName() << " is not allocated";
884 NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument)
885 << output.getName() << " is not allocated";
887 /// shortcut to cover when dimension matches
888 /// note that buffer_size, the last stride is only used in v_func but it
892 e.buffer_size = size();
894 v_func(e, getData(), m.getData(), output.getData());
898 return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
901 void Tensor::apply_broadcast_util(
903 std::function<void(const BroadcastInfo &e, const float *, const float *,
906 Tensor &output, const BroadcastInfo &e, int cur_axis, unsigned int offset,
907 unsigned int m_offset) const {
909 const float *buf = this->getData();
910 const float *m_buf = m.getData();
911 float *out_buf = output.getData();
913 if (e.buffer_axis == cur_axis) {
914 v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
919 for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) {
920 unsigned int next_offset = offset + i * strides[cur_axis];
921 unsigned int next_m_offset = m_offset + i * e.strides[cur_axis];
922 apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
928  * This sums the Tensor data over each batch (dim.batch()).
929  * Therefore the result has dimension (dim.batch(), 1, 1, 1).
931 Tensor Tensor::sum_by_batch() const {
932 NNTR_THROW_IF(!contiguous, std::invalid_argument)
933 << getName() << " is not contiguous, cannot sum";
935 Tensor ret(dim.batch(), 1, 1, 1);
936 unsigned int feat_len = dim.getFeatureLen();
937 unsigned int batch = dim.batch();
939 const float *data = getData();
940 float *rdata = ret.getData();
942 Tensor ones(1, 1, 1, feat_len);
944 sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
945 ones.getData(), 1, 0.0, rdata, 1);
951 * @brief Calculate sum according to the axis.
953 Tensor Tensor::sum(unsigned int axis, float alpha) const {
955 return sum(axis, ret, alpha, 0);
957 Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
959 const float *data = getData();
961 NNTR_THROW_IF(!contiguous, std::invalid_argument)
962 << getName() << " is not contiguous, cannot sum";
965 throw std::out_of_range("Error: axis is invalid");
967 if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
968 CREATE_IF_EMPTY_DIMS(ret, dim);
969 ret.copy(this->getData());
975 CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width());
976 unsigned int feat_len = dim.getFeatureLen();
977 unsigned int batch = dim.batch();
978 Tensor ones(1, 1, 1, batch);
979 ones.setValue(alpha);
980 sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
981 ones.getData(), 1, beta, ret.getData(), 1);
984 CREATE_IF_EMPTY_DIMS(ret, dim.batch(), 1, dim.height(), dim.width());
985 unsigned int feat_len = dim.height() * dim.width();
986 unsigned int channel = dim.channel();
987 Tensor ones(1, 1, 1, channel);
988 ones.setValue(alpha);
989 float *rdata = ret.getData();
990 for (unsigned int k = 0; k < dim.batch(); ++k) {
991 sgemv(CblasRowMajor, CblasTrans, channel, feat_len, 1,
992 &data[k * dim.getFeatureLen()], feat_len, ones.getData(), 1, beta,
993 &rdata[k * feat_len], 1);
997 CREATE_IF_EMPTY_DIMS(ret, dim.batch(), dim.channel(), 1, dim.width());
998 unsigned int width = dim.width();
999 unsigned int height = dim.height();
1000 Tensor ones(1, 1, 1, height);
1001 ones.setValue(alpha);
1002 float *rdata = ret.getData();
1003 for (unsigned int k = 0; k < dim.batch(); ++k) {
1004 for (unsigned int c = 0; c < dim.channel(); ++c) {
1006 k * dim.getFeatureLen() + c * dim.width() * dim.height();
1007 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim.width();
1008 sgemv(CblasRowMajor, CblasTrans, height, width, 1, &data[idx], width,
1009 ones.getData(), 1, beta, &rdata[ridx], 1);
1014 CREATE_IF_EMPTY_DIMS(ret, dim.batch(), dim.channel(), dim.height(), 1);
1015 unsigned int m = ret.dim.getDataLen();
1016 unsigned int n = dim.width();
1017 Tensor ones(1, 1, 1, n);
1018 ones.setValue(alpha);
1019 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, ones.getData(), 1,
1020 beta, ret.getData(), 1);
1023 throw std::out_of_range("Error: Dimension cannot exceed 3");
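/**
 * @note (editorial addition) Usage sketch for sum along a single axis; not
 * part of the original implementation. Assumes the default alpha = 1.0
 * declared in tensor.h; alpha scales the summed values.
 * @code
 *   nntrainer::Tensor t(1, 1, 2, 3);
 *   t.setValue(1.0f);
 *   nntrainer::Tensor s = t.sum(3);         // dim 1:1:2:1, every value 3.0
 *   nntrainer::Tensor h = t.sum(3, 0.5f);   // scaled by alpha, every value 1.5
 * @endcode
 */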
1028 Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
1030 return sum(axes, ret, alpha);
1033 void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
1034 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1035 << getName() << " is not contiguous, cannot merge axis";
1037 if (axis2 != axis1 + 1)
1038 throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
1040 dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2));
1041 dim.setTensorDim(axis1, 1);
1044 Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
1045 float alpha) const {
1047 throw std::invalid_argument("empty axes given");
1049 if (axes.size() == 1) {
1050 this->sum(axes[0], output, alpha);
1052 /** club axes together */
1053 Tensor new_reshaped = *this;
1054 std::vector<unsigned int> new_axes = {axes[0]};
1055 for (unsigned int i = 1; i < axes.size(); ++i) {
1056 if (axes[i] == axes[i - 1] + 1) {
1057 new_reshaped.mergeAxis(axes[i - 1], axes[i]);
1058 new_axes.back() = axes[i];
1060 new_axes.push_back(axes[i]);
1064 Tensor ret = new_reshaped.sum(new_axes[0]);
1065 for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
1066     ret = ret.sum(new_axes[i]);
1067 ret.sum(new_axes.back(), output, alpha);
1073 Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
1074 bool trans_m, float beta) const {
1075 if (!result.isAllocated())
1076 throw std::invalid_argument(
1077 "Output tensor must be preallocated for dotBatched operation");
1078 for (unsigned int b = 0; b < batch(); b++) {
1079 /** @todo try using transpose to speedup the operation */
1080 const Tensor this_b = this->getBatchSlice(b, 1);
1081 Tensor m_b = m.getBatchSlice(b, 1);
1082 Tensor result_b = result.getBatchSlice(b, 1);
1084 this_b.dot(m_b, result_b, trans, trans_m, beta);
1090 Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const {
1092 dot(m, output, trans, trans_m);
1097 * @brief compute the derivative of this in the current tensor
1098 * @todo will have to see if beta effects this computation
1100 Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
1101 bool trans, bool trans_m, float beta) {
1102 bool deriv_trans_m = true;
1103 bool deriv_trans = false;
1104 /** @todo handle all cases of trans and trans_m */
1105 if (!trans && trans_m) {
1106 deriv_trans_m = false;
1109 return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
1113 * @brief compute the derivative wrt m in the m tensor
1114 * @note The caller tensor must be the same tensor as the one which called the
1117 Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
1118 bool trans, bool trans_m, float beta) const {
1119 bool deriv_trans_m = false;
1120 bool deriv_trans = true;
1121 /** @todo handle all cases of trans and trans_m */
1123 if (!trans && trans_m) {
1124 output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1127 return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
1131 Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
1132 Tensor const &output_deriv, bool trans,
1133 bool trans_m, float beta) {
1134 bool deriv_trans_m = true;
1135 bool deriv_trans = false;
1136 /** @todo handle all cases of trans and trans_m */
1137 if (!trans && trans_m) {
1138 deriv_trans_m = false;
1141 return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
1144 Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
1145 Tensor const &output_deriv, bool trans,
1146 bool trans_m, float beta) const {
1147 bool deriv_trans_m = false;
1148 bool deriv_trans = true;
1149 /** @todo handle all cases of trans and trans_m */
1151 if (!trans && trans_m) {
1152 output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1155 return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
1160  * @note: This dot product flattens the first 3 axes for the purpose of
1161  * computation. So, while performing, these matrices behave as 2-D
1162  * matrices. The dimensions are restored when returning the tensor,
1163  * in case trans is false.
1165 Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m,
1167 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1168 << getName() << " is not contiguous. Cannot dot product.";
1170  // Commented out with the intention to support calculation wrt. the batch and
1171  // height direction. It assumes this->dim is [ BxCxH, W ] and m.dim is
1172  // [ BxCxH, W ] as well. if (m.dim.rank() > 2) {
1173 // throw exception::not_supported("Error: support only for rank of dot "
1177  // Commented out with the intention to support calculation wrt. the batch and
1178  // height direction of this tensor. It is OK as long as m is 2D.
1180 if (trans && dim.rank() > 2) {
1181 ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
1184 unsigned int dim1 = batch() * channel() * height();
1185 unsigned int dim2 = width();
1186 unsigned int mdim1 = m.batch() * m.channel() * m.height();
1187 unsigned int mdim2 = m.width();
1189 unsigned int M, N, K, lda, ldb, ldc;
1191 if (!trans && !trans_m) {
1193 throw std::runtime_error(
1194 "Error: incompatible dimensions for dot product");
1195 K = mdim1; /** == dim2 */
1198 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N);
1200     // We do not zero the result for performance reasons.
1201     // However, the result is then not initialized properly and may contain
1202     // garbage such as nan. When this value is used as C in C = alpha*A*B +
1203     // beta*C, we have to check whether the garbage data of C affects the result.
1205 } else if (!trans && trans_m) {
1207 throw std::runtime_error(
1208 "Error: incompatible dimensions for dot product");
1209 K = mdim2; /** == dim2 */
1212 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N);
1213 } else if (trans && !trans_m) {
1215 throw std::runtime_error(
1216 "Error: incompatible dimensions for dot product");
1217 K = mdim1; /** == dim1 */
1220 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N);
1223 throw std::runtime_error(
1224 "Error: incompatible dimensions for dot product");
1225 K = mdim2; /** == dim1 */
1228 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N);
1232 ldc = result.width();
1234 const float *data = getData();
1235 const float *mdata = m.getData();
1236 float *rdata = result.getData();
1237 const float alpha = 1.0f;
1238 enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
1239 enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
1241 /// shortcut handling in case of vector
1242 /// for vector, (1 * K) == (K * 1) in current memory layout...
1243   /// and please note that N, K, M are fixed placeholders after considering
1245 /// For example, there is no case like (1 * K) X (1 * K) while
1246 /// (1 * K) X (1 * M) can be a case
1247 /// case1: (1 * K) X (K * 1)
1248 if (M == 1 && N == 1) {
1249 *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
1251 /// case2: (M * K) X (K * 1)
1253 sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
1256 /// case3: (1 * K) X (K * N) = 1 * N = R
1257 /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
1258 /// Effectively a translation of sgemv
1260 transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
1261 sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, beta,
1264 /// case others: use gemm
1266 sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, ldb,
1273 Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
1274 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1275 << getName() << " is not contiguous. Cannot transpose.";
1277 if (out.getData() == getData()) {
1278 Tensor tmp = clone();
1279 return tmp.transpose(direction, out);
1282 unsigned int SL, SI, SJ, SK;
1286 out.reshape(dim.transpose(direction));
1288 int indexI = direction[0] - '0';
1289 int indexJ = direction[2] - '0';
1291 SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
1294 outptr = out.getData();
1299 transposeloop(l, i, j, k, SL, SI, SJ, SK);
1301 transposeloop(l, i, k, j, SL, SI, SK, SJ);
1306 transposeloop(l, j, i, k, SL, SJ, SI, SK);
1308 transposeloop(l, j, k, i, SL, SJ, SK, SI);
1313 transposeloop(l, k, i, j, SL, SK, SI, SJ);
1315 transposeloop(l, k, j, i, SL, SK, SJ, SI);
1323 Tensor Tensor::transpose(const std::string &direction) const {
1325 transpose(direction, result);
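/**
 * @note (editorial addition) Usage sketch for transpose; not part of the
 * original implementation. The direction string permutes channel(0),
 * height(1) and width(2); the batch axis is untouched.
 * @code
 *   nntrainer::Tensor t(1, 1, 2, 3);
 *   t.setRandNormal(0.0f, 1.0f);
 *   nntrainer::Tensor tt = t.transpose("0:2:1");   // dim becomes 1:1:3:2
 * @endcode
 */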
1329 Tensor Tensor::dropout_mask(float dropout) const {
1331 result.dropout_mask(dropout);
1335 void Tensor::dropout_mask(float dropout) {
1336 setRandUniform(0.0, 1.0);
1337 float scale = 1.0 / (1 - dropout);
1338 float *data_ = getData();
1340 for (unsigned int i = 0; i < size(); ++i) {
1341 if (data_[i] >= dropout)
1348 void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
1349 float fill_mask_val = 0.0;
1350 float en_mask_val = 1.0 - fill_mask_val;
1353 fill_mask_val = 1.0;
1354 en_mask_val = 1.0 - fill_mask_val;
1357 setValue(fill_mask_val);
1358 if (mask_len.batch() != batch())
1359 throw std::invalid_argument("Number of filter masks mismatched");
1361 for (unsigned int b = 0; b < batch(); b++) {
1362 float *addr = getAddress(b, 0, 0, 0);
1363 const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
1364 std::fill(addr, addr + (*mask_len_val), en_mask_val);
1368 Tensor Tensor::zoneout_mask(float zoneout) {
1369 Tensor ret(getDim());
1370 zoneout_mask(ret, zoneout);
1374 void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
1375 if (dim != opposite.dim) {
1376 throw std::invalid_argument(
1377 "[Tensor::zoneout_mask] opposite dimension does not match");
1380 opposite.setRandBernoulli(zoneout);
1381 float *data = getData();
1382 float *opposite_data = opposite.getData();
1384 for (unsigned int i = 0; i < size(); ++i) {
1385 if (opposite_data[i] > epsilon) {
1393 int Tensor::apply_i(std::function<float(float)> f) {
1394 Tensor result = *this;
1397 return ML_ERROR_NONE;
1400 Tensor Tensor::apply(std::function<float(float)> f) const {
1402 return apply(f, result);
1405 Tensor &Tensor::apply(std::function<float(float)> f, Tensor &output) const {
1406 CREATE_IF_EMPTY_DIMS(output, dim);
1408 if (dim != output.dim) {
1409 /// @todo add unittest
1410 throw std::invalid_argument(
1411 "[Tensor::apply] output dimension does not match");
1414 if (contiguous && output.contiguous) {
1415 const float *data = getData();
1416 float *rdata = output.getData();
1417 std::transform(data, data + size(), rdata, f);
1418 } else if (strides[3] == 1 && output.strides[3] == 1) {
1419 /** @todo optimize this with combining these loops where stride is 1 */
1420 for (unsigned int b = 0; b < batch(); ++b) {
1421 for (unsigned int c = 0; c < channel(); ++c) {
1422 for (unsigned int h = 0; h < height(); ++h) {
1423 float *out_data = output.getAddress(b, c, h, 0);
1424 const float *in_data = getAddress(b, c, h, 0);
1425 std::transform(in_data, in_data + width(), out_data, f);
1430 for (unsigned int b = 0; b < batch(); ++b) {
1431 for (unsigned int c = 0; c < channel(); ++c) {
1432 for (unsigned int h = 0; h < height(); ++h) {
1433 for (unsigned int w = 0; w < width(); ++w) {
1434 output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
1444 Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
1446 Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
1447 Tensor &output) const {
1448 return f(*this, output);
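/**
 * @note (editorial addition) Usage sketch for the element-wise apply family;
 * not part of the original implementation.
 * @code
 *   nntrainer::Tensor t(1, 1, 1, 4);
 *   t.setValue(-2.0f);
 *   nntrainer::Tensor r =
 *     t.apply([](float x) { return x > 0.0f ? x : 0.0f; });   // ReLU-like copy
 *   t.apply_i([](float x) { return x * 2.0f; });              // in-place update
 * @endcode
 */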
1451 void Tensor::print(std::ostream &out) const {
1452 printInstance(out, this);
1453 const float *data = getData();
1455 unsigned int len = size();
1456 out << "data addr: " << data << '\n';
1460 out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
1461 << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
1466 std::ios init(NULL);
1468 for (unsigned int k = 0; k < dim.batch(); k++) {
1469 for (unsigned int l = 0; l < dim.channel(); l++) {
1470 for (unsigned int i = 0; i < dim.height(); i++) {
1471 for (unsigned int j = 0; j < dim.width(); j++) {
1472 out << std::setw(10) << std::setprecision(10)
1473 << this->getValue(k, l, i, j) << " ";
1479 out << "-------" << std::endl;
1484 std::ostream &operator<<(std::ostream &out, Tensor const &m) {
1489 void Tensor::copy(const float *buf) {
1490 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1491     << getName() << " Tensor is not contiguous, cannot copy.";
1493 if (buf == getData()) {
1497 scopy(size(), buf, 1, getData(), 1);
1500 void Tensor::copy_with_stride(const Tensor &from) {
1502 if (dim == from.getDim()) {
1503 for (unsigned int b = 0; b < batch(); ++b) {
1504 for (unsigned int c = 0; c < channel(); ++c) {
1505 for (unsigned int h = 0; h < height(); ++h) {
1506 for (unsigned int w = 0; w < width(); ++w) {
1507 setValue(b, c, h, w, from.getValue(b, c, h, w));
1513 Tensor t = Tensor(from.getDim(), true);
1514 for (unsigned int b = 0; b < t.batch(); ++b) {
1515 for (unsigned int c = 0; c < t.channel(); ++c) {
1516 for (unsigned int h = 0; h < t.height(); ++h) {
1517 for (unsigned int w = 0; w < t.width(); ++w) {
1518 t.setValue(b, c, h, w, from.getValue(b, c, h, w));
1527 void Tensor::copy(const Tensor &from) {
1528 // todo: enable copy to non-contiguous tensor
1530 throw std::runtime_error("Cannot copy non-contiguous tensor");
1533 if (from.size() != 0 && size() == from.size()) {
1534 reshape(from.getDim());
1535 copy(from.getData());
1537 Tensor t = Tensor(from.getDim(), from.getData());
1542 void Tensor::copyData(const Tensor &from) {
1543 // todo: enable copy to non-contiguous tensor
1545 throw std::runtime_error("Cannot copy non-contiguous tensor");
1548 if (size() != from.size())
1549 throw std::invalid_argument("Size of tensor to copy must match");
1550 copy(from.getData());
1553 Tensor Tensor::clone() const {
1560 void Tensor::reshape(const TensorDim &d) {
1562 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1563 << getName() << " is not contiguous, cannot reshape.";
1565 NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument)
1566     << "[Tensor]: reshape cannot change the buffer size, trying to reshape "
1568 << getDim() << " to " << d;
1571 strides = d.computeStrides();
1574 void Tensor::fill(const Tensor &from, bool alloc) {
1575 if (alloc && this->empty()) {
1580 if (!from.contiguous || !contiguous) {
1581 /// @todo enable this if needed
1582 throw nntrainer::exception::not_supported(
1583 "[Tensor::fill] non-contiguous tensors are not supported");
1586 if (dim != from.getDim()) {
1587 throw std::invalid_argument("[Tensor::fill] dimension must be the same");
1590 if (strides != from.getStrides()) {
1591 /// @todo length does not represent buffer size, there should be way to get
1593 throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
1596 this->copy(from.getData());
1599 void Tensor::save(std::ostream &file) {
1600 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1601 << getName() << " is not contiguous, cannot save.";
1603 std::streamsize sz = static_cast<std::streamsize>(bytes());
1604 NNTR_THROW_IF(sz < 0, std::invalid_argument)
1605 << "save size: " << bytes()
1606 << " is too big. It cannot be represented by std::streamsize";
1608 checkedWrite(file, (char *)getData(), sz, "[Tensor::save] operation failed");
1612 void Tensor::read(std::ifstream &file) {
1613 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1614 << getName() << " is not contiguous, cannot read.";
1616 std::streamsize sz = static_cast<std::streamsize>(bytes());
1618 NNTR_THROW_IF(sz < 0, std::invalid_argument)
1619 << "read size: " << bytes()
1620 << " is too big. It cannot be represented by std::streamsize";
1622 checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
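/**
 * @note (editorial addition) Usage sketch for save/read round-tripping; not
 * part of the original implementation. Requires <fstream>; both tensors must
 * be contiguous and of identical size.
 * @code
 *   nntrainer::Tensor t(1, 1, 2, 2);
 *   t.setRandNormal(0.0f, 1.0f);
 *   std::ofstream ofs("tensor.bin", std::ios::out | std::ios::binary);
 *   t.save(ofs);
 *   ofs.close();
 *
 *   nntrainer::Tensor loaded(1, 1, 2, 2);
 *   std::ifstream ifs("tensor.bin", std::ios::in | std::ios::binary);
 *   loaded.read(ifs);
 * @endcode
 */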
1627 * @brief Calculate average value according to the axis.
1629 Tensor Tensor::average(unsigned int axis) const {
1631 return average(axis, t);
1635 * @brief Calculate average value according to the axis.
1637 Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
1638 if (axis >= TensorDim::MAXDIM)
1639 throw std::out_of_range(
1640       "negative axis or axis more than MAXDIM is invalid");
1642 unsigned int axis_size = dim.getDim()[axis];
1646 this->sum(axis, output, 1.0 / ((float)axis_size));
1651 Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
1653 return average(axes, t);
1656 Tensor &Tensor::average(const std::vector<unsigned int> &axes,
1657 Tensor &output) const {
1659 return this->average(output);
1661 TensorDim ret_shape;
1662 for (const auto &idx : axes) {
1663 if (idx >= TensorDim::MAXDIM) {
1664       throw std::out_of_range("axis more than MAXDIM is invalid");
1666 ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
1669 return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
1673 * @brief Calculate the average value of all the elements.
1675 Tensor Tensor::average() const {
1676 Tensor result = *this;
1677 result.reshape({1, 1, 1, dim.getDataLen()});
1678 return result.average(3);
1682 * @brief Calculate the average value of all the elements.
1684 Tensor &Tensor::average(Tensor &output) const {
1685 Tensor result = *this;
1686 result.reshape({1, 1, 1, dim.getDataLen()});
1687 return result.average(3, output);
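/**
 * @note (editorial addition) Usage sketch for the average overloads above; not
 * part of the original implementation.
 * @code
 *   nntrainer::Tensor t(1, 1, 2, 4);
 *   t.setValue(2.0f);
 *   nntrainer::Tensor per_row = t.average(3);   // dim 1:1:2:1, every value 2.0
 *   nntrainer::Tensor overall = t.average();    // dim 1:1:1:1, value 2.0
 * @endcode
 */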
1690 void Tensor::setValue(float val) {
1691 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1692 << getName() << " is not contiguous, cannot set value.";
1694 float *data = getData();
1695 std::fill(data, data + size(), val);
1698 void Tensor::setZero() {
1700 sscal(size(), 0, getData(), 1);
1702 apply_i([](float val) -> float { return 0; });
1705 std::vector<unsigned int> Tensor::argmax() const {
1706 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1707 << getName() << " is not contiguous, cannot get argmax.";
1709 const float *data = getData();
1710 std::vector<unsigned int> result;
1711 unsigned int batch_size = batch();
1712 unsigned int feature_len = dim.getFeatureLen();
1714 result.resize(batch_size);
1716 for (unsigned int b = 0; b < batch_size; b++) {
1718 std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
1719 result[b] = std::distance(data, max_iter) - (b * feature_len);
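/**
 * @note (editorial addition) Usage sketch for argmax; not part of the original
 * implementation. The index is taken over the flattened feature dimensions
 * (channel * height * width) of each batch.
 * @code
 *   nntrainer::Tensor logits(2, 1, 1, 4);
 *   logits.setValue(0.0f);
 *   logits.setValue(0, 0, 0, 3, 1.0f);                 // batch 0 peaks at index 3
 *   logits.setValue(1, 0, 0, 1, 1.0f);                 // batch 1 peaks at index 1
 *   std::vector<unsigned int> idx = logits.argmax();   // idx == {3, 1}
 * @endcode
 */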
1725 float Tensor::l2norm() const {
1726 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1727 << getName() << " is not contiguous, cannot get l2norm.";
1729 unsigned int len = size();
1730 const float *data = getData();
1732 return snrm2(len, data, 1);
1735 float Tensor::max_abs() const {
1736 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1737 << getName() << " is not contiguous, cannot get max_abs.";
1739 unsigned int len = size();
1740 const float *data = getData();
1742 unsigned int idx = isamax(len, data, 1);
1743 return *(data + idx);
1746 Tensor &Tensor::normalization(Tensor &output) const {
1748 output = Tensor(dim);
1751 output.normalization_i();
1756 void Tensor::normalization_i() {
1757 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1758 << getName() << " is not contiguous, cannot do normalization.";
1760 const float *data = getData();
1762 auto bounds = std::minmax_element(data, data + size());
1763 const float min = *bounds.first;
1764 const float max = *bounds.second;
1768 this->subtract_i(tmp);
1770 this->subtract_i(min);
1771 this->divide_i(max - min);
1775 LazyTensor Tensor::chain() const { return LazyTensor(*this); }
1777 Tensor &Tensor::standardization(Tensor &output) const {
1779 output = Tensor(dim);
1782 output.standardization_i();
1787 void Tensor::standardization_i() {
1788 Tensor mean_by_batch = this->sum_by_batch();
1789 mean_by_batch.divide_i(dim.getFeatureLen());
1791 this->subtract_i(mean_by_batch);
1793 Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
1794 std_dev_by_batch.setZero();
1795 float *std_dev = std_dev_by_batch.getData();
1797 for (unsigned int k = 0; k < dim.batch(); ++k) {
1798 Tensor sub_this = this->getBatchSlice(k, 1);
1799 std_dev[k] = sub_this.l2norm();
1802 std_dev_by_batch.divide_i(dim.getFeatureLen());
1803 this->divide_i(std_dev_by_batch);
1806 Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
1807 if (m.size() > this->size())
1808 throw exception::not_supported("broadcasting *this is not supported");
1810 const TensorDim m_dim = m.getDim();
1814   /// checking if the given Tensors can be broadcast
1815 for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) {
1816 if (dim.getTensorDim(i) == m_dim.getTensorDim(i)) {
1817 e.strides[i] = m.strides[i];
1821     /// If the given dimension is 1, it can be reused with the stride remaining 0.
1822     /// Need to check if dim[i] == 1 && m_dim[i] == 1 first, though;
1823     /// if so, the strides should not change
1824 if (m_dim.getTensorDim(i) == 1) {
1828 std::stringstream ss;
1829 ss << "[computeBroadcastInfo] broadcasting only allowed for "
1830 "dimension value of 1 \n"
1831        << "this: " << dim << " target: " << m_dim;
1832 throw std::invalid_argument(ss.str().c_str());
1835 /// calculate inner loop size
1838 e.strides[3] = m.strides[3];
1840 /// initiate buffer info with matching dimension strategy
1841 for (int axis = 3; axis >= 0; --axis) {
1842 if (dim.getTensorDim(axis) != m_dim.getTensorDim(axis)) {
1843 e.buffer_axis = axis;
1847 e.buffer_size *= dim.getTensorDim(axis);
1850 /// check strategy that uses consecutive ones
1851 if (m_dim.getTensorDim(3) == 1) {
1852 unsigned int inner_loop_size = 1;
1854 for (axis = 3; axis >= 0; --axis) {
1855 if (m_dim.getTensorDim(axis) != 1) {
1859 inner_loop_size *= dim.getTensorDim(axis);
1862 /// if consecutive-one strategy has bigger chunk size, replace the
1864 if (inner_loop_size > e.buffer_size) {
1865 e.buffer_axis = axis;
1866 e.buffer_size = inner_loop_size;
1874 } /* namespace nntrainer */