2 * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * http://www.apache.org/licenses/LICENSE-2.0
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
16 * @date 04 December 2019
17 * @brief This is the Tensor class for calculation
18 * @see https://github.com/nnstreamer/nntrainer
19 * @author Jijoong Moon <jijoong.moon@samsung.com>
20 * @bug No known bugs except for NYI items
38 #include <lazy_tensor.h>
39 #include <nntrainer_log.h>
41 #include <util_func.h>
43 #define transposeloop(cl, ci, cj, ck, sl, si, sj, sk) \
45 unsigned int i, j, k, l; \
46 int inidx = 0, outidx = 0; \
47 for (cl = 0; cl < sl; cl++) \
48 for (ci = 0; ci < si; ci++) \
49 for (cj = 0; cj < sj; cj++) \
50 for (ck = 0; ck < sk; ck++) { \
51 outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
52 inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k; \
53 outptr[outidx] = inptr[inidx]; \
57 #define transposeloop_nhwc(cl, ci, cj, ck, sl, si, sj, sk) \
59 unsigned int i, j, k, l; \
60 int inidx = 0, outidx = 0; \
61 for (cl = 0; cl < sl; cl++) \
62 for (ci = 0; ci < si; ci++) \
63 for (cj = 0; cj < sj; cj++) \
64 for (ck = 0; ck < sk; ck++) { \
65 outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
66 inidx = l * SJ * SK * SI + j * SK * SI + k * SI + i; \
67 outptr[outidx] = inptr[inidx]; \
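/**
 * @note SI, SJ, SK as well as inptr and outptr are not macro parameters; they
 * must already be visible at the expansion site (the source dimensions and the
 * raw input/output pointers), and the counters passed as cl/ci/cj/ck are
 * expected to be chosen from the locally declared i, j, k, l so that inidx
 * follows the source layout. A rough, hypothetical expansion-site sketch:
 * @code
 *   // assuming unsigned int SL, SI, SJ, SK and float *inptr, *outptr in scope
 *   transposeloop(l, i, j, k, SL, SI, SJ, SK);
 * @endcode
 */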
74 * @struct BroadcastInfo
75 * @brief External loop info for broadcasted iteration. Please refer to
76 * DISABLED_private_external_loop_n in unittest_nntrainer_tensor.
77 * @note This would be better implemented in an iterator fashion before use
80 struct Tensor::BroadcastInfo {
83 * @brief Construct a new External Loop Info object
90 tensor_type(nntrainer::TensorDim::TensorType()) {}
92 unsigned int buffer_size; /**< virtual size of the buffer */
93 int buffer_axis; /**< the smallest axis that should be looped.
94 -1 means no loop needed*/
95 std::array<unsigned int, TensorDim::MAXDIM>
96 strides; /**< modified strides for the loop */
97 nntrainer::TensorDim::TensorType tensor_type;
100 Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init,
103 if (d.getDataLen() != 0) {
105 strides = d.computeStrides();
112 Tensor::Tensor(const TensorDim &d, const void *buf) : Tensor(d, true) {
113 if (d.getDataLen() != 0) {
120 * @class SrcSharedTensor
121 * @brief Source of the shared tensor
123 class SrcSharedTensor {
126 * @brief Constructor for the class
128 SrcSharedTensor() : src(nullptr), off(0) {}
130 SrcSharedTensor(const Tensor *tensor, size_t offset) :
131 src(tensor), off(offset) {}
134 * @brief Get the allocated src tensor
136 const Tensor *tensor() const {
138 throw std::runtime_error("Accessing empty src tensor");
144 * @brief Get the offset from the source tensor
146 size_t offset() const { return off; }
149 const Tensor *src; /**< Tensor of the source */
150 size_t off; /**< offset from the source data ptr */
153 void Tensor::allocate() {
155 /// already allocated
159 /// allocate data based on the source tensor
160 data = src_tensor->tensor()->data;
161 offset = src_tensor->tensor()->offset + src_tensor->offset();
162 /** as this memory is shared, do NOT initialize */
164 /// allocate new memory for the tensor data
166 MemoryData *mem_data;
168 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
169 mem_data = new MemoryData((void *)(new float[dim.getDataLen()]()));
170 data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
171 delete[] (float *)mem_data->getAddr();
175 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
176 mem_data = new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
177 data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
178 delete[] (__fp16 *)mem_data->getAddr();
187 bool Tensor::operator==(const Tensor &rhs) const {
188 if (this->dim != rhs.dim)
193 if (len != rhs.size())
196 if (contiguous != rhs.contiguous)
199 if (strides != rhs.strides)
202 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
203 const float *_data = getData<float>();
204 const float *_rdata = rhs.getData<float>();
205 for (size_t i = 0; i < len; ++i) {
206 /** not checking sign change is intentional to avoid float calculation
208 if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
209 (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
210 std::fabs(_data[i] - _rdata[i]) > epsilon)
213 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
214 const __fp16 *_data = getData<__fp16>();
215 const __fp16 *_rdata = rhs.getData<__fp16>();
216 for (size_t i = 0; i < len; ++i) {
217 if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
218 (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
219 std::fabs(_data[i] - _rdata[i]) > epsilon)
227 void Tensor::setRandNormal(float mean, float std) {
228 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
229 setDist<float, std::normal_distribution<float>>(
230 std::normal_distribution<float>(mean, std));
231 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
232 throw std::invalid_argument(
233 "__fp16 is not supported by std::normal_distribution");
237 void Tensor::setRandUniform(float min, float max) {
238 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
239 setDist<float, std::uniform_real_distribution<float>>(
240 std::uniform_real_distribution<float>(min, max));
241 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
242 throw std::invalid_argument(
243 "__fp16 is not supported by std::uniform_real_distribution");
247 void Tensor::setRandBernoulli(float probability) {
248 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
249 setDist<float, std::bernoulli_distribution>(
250 std::bernoulli_distribution(probability));
251 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
252 setDist<__fp16, std::bernoulli_distribution>(
253 std::bernoulli_distribution((__fp16)probability));
257 void Tensor::initialize() {
258 if (empty() || !isAllocated())
261 unsigned int fan_in, fan_out;
263 /// @fixme: when unit is equal to one, this does not work; we need to rely on
264 /// the effective dimension rather than the actual numbers here. For now, some
265 /// heuristics are added to infer what fan_in/fan_out would be
266 if (dim.batch() * dim.channel() * dim.height() == 1) {
267 fan_out = fan_in = dim.width();
268 } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
269 fan_in = dim.height();
270 fan_out = dim.width();
271 } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
272 auto field_size = dim.height() * dim.width();
274 // this also handles the cases below:
275 // 1. fan_in = fan_out = 1
276 // 2. batch == 1, channel == 1 and height == 1 (theoretical rank of 1)
277 fan_in = dim.channel() * field_size;
278 fan_out = dim.batch() * field_size;
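// Worked example with a hypothetical conv2d filter of dim (64, 3, 3, 3):
// field_size = 3 * 3 = 9, so fan_in = 3 * 9 = 27 and fan_out = 64 * 9 = 576.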
281 switch (initializer) {
282 case Tensor::Initializer::ZEROS:
285 case Tensor::Initializer::ONES:
288 case Tensor::Initializer::LECUN_NORMAL:
289 setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
291 case Tensor::Initializer::XAVIER_NORMAL:
292 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
294 case Tensor::Initializer::HE_NORMAL:
295 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
297 case Tensor::Initializer::LECUN_UNIFORM:
298 setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
300 case Tensor::Initializer::XAVIER_UNIFORM:
301 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
302 sqrtFloat(6.0 / (fan_in + fan_out)));
304 case Tensor::Initializer::HE_UNIFORM:
305 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
306 sqrtFloat(6.0 / (fan_in)));
315 int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
317 this->multiply_strided(m, *this, beta);
318 } catch (std::exception &err) {
319 ml_loge("%s %s", typeid(err).name(), err.what());
320 return ML_ERROR_INVALID_PARAMETER;
323 return ML_ERROR_NONE;
326 Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
328 return this->multiply_strided(m, t, beta);
331 Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
332 const float beta) const {
333 /** TODO: throw rather than create new dimensions */
334 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
336 if (size() != m.size() || size() != output.size())
337 throw std::invalid_argument(
338 "Strided multiplication does not support broadcasting");
340 if (getDataType() == Tdatatype::FP32) {
341 NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
342 << getName() << " is not allocated";
343 NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
344 << m.getName() << " is not allocated";
345 NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
346 << output.getName() << " is not allocated";
347 } else if (getDataType() == Tdatatype::FP16) {
348 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
349 << getName() << " is not allocated";
350 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
351 << m.getName() << " is not allocated";
352 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
353 << output.getName() << " is not allocated";
357 if (this->getFormat() == Tformat::NCHW) {
358 if (getDataType() == Tdatatype::FP32) {
359 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
361 for (unsigned int b = 0; b < batch(); ++b) {
362 for (unsigned int c = 0; c < channel(); ++c) {
363 for (unsigned int h = 0; h < height(); ++h) {
364 for (unsigned int w = 0; w < width(); ++w) {
365 output.addValue(b, c, h, w,
366 getValue<float>(b, c, h, w) *
367 m.getValue<float>(b, c, h, w),
374 /** @todo optimize this by combining these loops where stride is 1
376 for (unsigned int b = 0; b < batch(); ++b) {
377 for (unsigned int c = 0; c < channel(); ++c) {
378 for (unsigned int h = 0; h < height(); ++h) {
379 float *out_data = output.getAddress<float>(b, c, h, 0);
380 const float *m_data = m.getAddress<float>(b, c, h, 0);
381 const float *in_data = getAddress<float>(b, c, h, 0);
382 std::transform(in_data, in_data + width(), m_data, out_data,
383 std::multiplies<float>());
388 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
389 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
391 for (unsigned int b = 0; b < batch(); ++b) {
392 for (unsigned int c = 0; c < channel(); ++c) {
393 for (unsigned int h = 0; h < height(); ++h) {
394 for (unsigned int w = 0; w < width(); ++w) {
395 output.addValue(b, c, h, w,
396 getValue<__fp16>(b, c, h, w) *
397 m.getValue<__fp16>(b, c, h, w),
404 for (unsigned int b = 0; b < batch(); ++b) {
405 for (unsigned int c = 0; c < channel(); ++c) {
406 for (unsigned int h = 0; h < height(); ++h) {
407 __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
408 const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
409 const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
410 std::transform(in_data, in_data + width(), m_data, out_data,
411 std::multiplies<__fp16>());
417 } else { // Format NHWC Case
418 if (getDataType() == Tdatatype::FP32) {
419 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
421 for (unsigned int b = 0; b < batch(); ++b) {
422 for (unsigned int h = 0; h < height(); ++h) {
423 for (unsigned int w = 0; w < width(); ++w) {
424 for (unsigned int c = 0; c < channel(); ++c) {
425 output.addValue(b, c, h, w,
426 getValue<float>(b, c, h, w) *
427 m.getValue<float>(b, c, h, w),
434 /** @todo optimize this by combining these loops where
436 for (unsigned int b = 0; b < batch(); ++b) {
437 for (unsigned int h = 0; h < height(); ++h) {
438 for (unsigned int w = 0; w < width(); ++w) {
439 float *out_data = output.getAddress<float>(b, 0, h, w);
440 const float *m_data = m.getAddress<float>(b, 0, h, w);
441 const float *in_data = getAddress<float>(b, 0, h, w);
442 std::transform(in_data, in_data + channel(), m_data, out_data,
443 std::multiplies<float>());
448 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
449 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
451 for (unsigned int b = 0; b < batch(); ++b) {
452 for (unsigned int h = 0; h < height(); ++h) {
453 for (unsigned int w = 0; w < width(); ++w) {
454 for (unsigned int c = 0; c < channel(); ++c) {
455 output.addValue(b, c, h, w,
456 getValue<__fp16>(b, c, h, w) *
457 m.getValue<__fp16>(b, c, h, w),
464 /** @todo optimize this by combining these loops where
466 for (unsigned int b = 0; b < batch(); ++b) {
467 for (unsigned int h = 0; h < height(); ++h) {
468 for (unsigned int w = 0; w < width(); ++w) {
469 __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
470 const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
471 const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
472 std::transform(in_data, in_data + channel(), m_data, out_data,
473 std::multiplies<__fp16>());
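/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): for same-sized contiguous tensors the transform path above is a plain
 * element-wise product, and broadcasting is rejected.
 * @code
 *   Tensor a(1, 1, 2, 3), b(1, 1, 2, 3), out(1, 1, 2, 3);
 *   a.setValue(2.0f);
 *   b.setValue(3.0f);
 *   a.multiply_strided(b, out); // assuming default beta, out holds 6.0f everywhere
 * @endcode
 */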
484 int Tensor::add_i_strided(Tensor const &m, const float beta) {
486 this->add_strided(m, *this, beta);
487 } catch (std::exception &err) {
488 ml_loge("%s %s", typeid(err).name(), err.what());
489 return ML_ERROR_INVALID_PARAMETER;
492 return ML_ERROR_NONE;
495 Tensor Tensor::add_strided(Tensor const &m, const float beta) const {
497 return this->add_strided(m, t, beta);
500 Tensor &Tensor::add_strided(Tensor const &m, Tensor &output,
501 const float beta) const {
502 /** TODO: throw rather than create new dimensions */
503 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
505 if (size() != m.size() || size() != output.size())
506 throw std::invalid_argument(
507 "Strided addition does not support broadcasting");
509 if (getDataType() == Tdatatype::FP32) {
510 NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
511 << getName() << " is not allocated";
512 NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
513 << m.getName() << " is not allocated";
514 NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
515 << output.getName() << " is not allocated";
516 } else if (getDataType() == Tdatatype::FP16) {
517 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
518 << getName() << " is not allocated";
519 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
520 << m.getName() << " is not allocated";
521 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
522 << output.getName() << " is not allocated";
526 if (this->getFormat() == Tformat::NCHW) {
527 if (getDataType() == Tdatatype::FP32) {
528 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
530 for (unsigned int b = 0; b < batch(); ++b) {
531 for (unsigned int c = 0; c < channel(); ++c) {
532 for (unsigned int h = 0; h < height(); ++h) {
533 for (unsigned int w = 0; w < width(); ++w) {
534 output.setValue(b, c, h, w,
535 getValue<float>(b, c, h, w) +
536 m.getValue<float>(b, c, h, w) * beta);
542 /** @todo optimize this by combining these loops where stride is 1 */
543 for (unsigned int b = 0; b < batch(); ++b) {
544 for (unsigned int c = 0; c < channel(); ++c) {
545 for (unsigned int h = 0; h < height(); ++h) {
546 float *out_data = output.getAddress<float>(b, c, h, 0);
547 const float *m_data = m.getAddress<float>(b, c, h, 0);
548 const float *in_data = getAddress<float>(b, c, h, 0);
549 std::transform(in_data, in_data + width(), m_data, out_data,
555 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
556 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
558 for (unsigned int b = 0; b < batch(); ++b) {
559 for (unsigned int c = 0; c < channel(); ++c) {
560 for (unsigned int h = 0; h < height(); ++h) {
561 for (unsigned int w = 0; w < width(); ++w) {
562 output.setValue(b, c, h, w,
563 getValue<__fp16>(b, c, h, w) +
564 m.getValue<__fp16>(b, c, h, w) * beta);
570 for (unsigned int b = 0; b < batch(); ++b) {
571 for (unsigned int c = 0; c < channel(); ++c) {
572 for (unsigned int h = 0; h < height(); ++h) {
573 __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
574 const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
575 const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
576 std::transform(in_data, in_data + width(), m_data, out_data,
577 std::plus<__fp16>());
583 } else { // Format NHWC Case
584 if (getDataType() == Tdatatype::FP32) {
585 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
587 for (unsigned int b = 0; b < batch(); ++b) {
588 for (unsigned int h = 0; h < height(); ++h) {
589 for (unsigned int w = 0; w < width(); ++w) {
590 for (unsigned int c = 0; c < channel(); ++c) {
591 output.setValue(b, c, h, w,
592 getValue<float>(b, c, h, w) +
593 m.getValue<float>(b, c, h, w) * beta);
599 /** @todo optimize this by combining these loops where
601 for (unsigned int b = 0; b < batch(); ++b) {
602 for (unsigned int h = 0; h < height(); ++h) {
603 for (unsigned int w = 0; w < width(); ++w) {
604 float *out_data = output.getAddress<float>(b, 0, h, w);
605 const float *m_data = m.getAddress<float>(b, 0, h, w);
606 const float *in_data = getAddress<float>(b, 0, h, w);
607 std::transform(in_data, in_data + channel(), m_data, out_data,
613 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
614 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
616 for (unsigned int b = 0; b < batch(); ++b) {
617 for (unsigned int h = 0; h < height(); ++h) {
618 for (unsigned int w = 0; w < width(); ++w) {
619 for (unsigned int c = 0; c < channel(); ++c) {
620 output.setValue(b, c, h, w,
621 getValue<__fp16>(b, c, h, w) +
622 m.getValue<__fp16>(b, c, h, w) * beta);
628 /** @todo optimize this by combining these loops where
630 for (unsigned int b = 0; b < batch(); ++b) {
631 for (unsigned int h = 0; h < height(); ++h) {
632 for (unsigned int w = 0; w < width(); ++w) {
633 __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
634 const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
635 const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
636 std::transform(in_data, in_data + channel(), m_data, out_data,
637 std::plus<__fp16>());
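/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): the contiguous fast path above is a plain element-wise sum, while the
 * strided path computes *this + m * beta.
 * @code
 *   Tensor a(1, 1, 2, 2), b(1, 1, 2, 2), out(1, 1, 2, 2);
 *   a.setValue(1.0f);
 *   b.setValue(4.0f);
 *   a.add_strided(b, out); // on the fast path, out holds 5.0f everywhere
 * @endcode
 */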
647 int Tensor::multiply_i(float const &value) {
648 NNTR_THROW_IF(!contiguous, std::invalid_argument)
649 << getName() << " is not contiguous, cannot multiply";
651 /// @note this does not depend on multiply_i as there is an optimized
652 /// version for multiply_i
653 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
654 float *data = getData<float>();
655 unsigned int len = size();
657 sscal(len, value, data, 1);
658 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
659 __fp16 *data = getData<__fp16>();
660 unsigned int len = size();
661 sscal(len, value, data, 1);
663 return ML_ERROR_NONE;
666 Tensor Tensor::multiply(float const &value) const {
668 return multiply(value, t);
671 Tensor &Tensor::multiply(float const &value, Tensor &out) const {
672 /// @todo add unittest
673 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
674 auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
675 return apply(f, out);
676 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
677 auto f = std::bind(std::multiplies<__fp16>(), std::placeholders::_1, value);
678 return apply(f, out);
683 int Tensor::multiply_i(Tensor const &m, const float beta) {
685 this->multiply(m, *this, beta);
686 } catch (std::exception &err) {
687 ml_loge("%s %s", typeid(err).name(), err.what());
688 return ML_ERROR_INVALID_PARAMETER;
691 return ML_ERROR_NONE;
694 Tensor Tensor::multiply(Tensor const &m, const float beta) const {
695 Tensor t("", this->getFormat());
696 return this->multiply(m, t, beta);
699 Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
700 const float beta) const {
702 * @note this does not work correctly with differently strided inputs.
703 * Use multiply_strided instead
705 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
706 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
708 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
710 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
711 std::multiplies<float>());
713 for (unsigned int i = 0; i < e.buffer_size; ++i) {
714 *out_buf = *buf * *m_buf + beta * *out_buf;
716 m_buf += e.strides[3];
717 out_buf += output.strides[3];
722 NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
723 << "Tensor Format of " << getName() << ":"
724 << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match. ("
725 << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
727 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
728 std::invalid_argument)
729 << getName() << " is not contiguous, cannot multiply";
735 apply_broadcast(m, f, output);
738 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
739 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
741 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
743 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
744 std::multiplies<__fp16>());
746 for (unsigned int i = 0; i < e.buffer_size; ++i) {
747 *out_buf = *buf * *m_buf + beta * *out_buf;
749 m_buf += e.strides[3];
750 out_buf += output.strides[3];
755 NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
756 << "Tensor Format of " << getName() << ":"
757 << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match. ("
758 << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
760 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
761 std::invalid_argument)
762 << getName() << " is not contiguous, cannot multiply";
764 apply_broadcast(m, f, output);
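/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): m may be broadcast, and the result is (*this) * m + beta * output.
 * @code
 *   Tensor x(1, 2, 2, 4);     // data tensor
 *   Tensor scale(1, 2, 1, 1); // per-channel scale, broadcast over H and W
 *   Tensor y = x.multiply(scale); // y(b, c, h, w) == x(b, c, h, w) * scale(0, c, 0, 0)
 * @endcode
 */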
770 int Tensor::divide_i(float const &value) {
772 return ML_ERROR_INVALID_PARAMETER;
774 this->divide(value, *this);
775 return ML_ERROR_NONE;
778 Tensor Tensor::divide(float const &value) const {
780 return divide(value, t);
783 Tensor &Tensor::divide(float const &value, Tensor &out) const {
784 auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
785 /// @todo add unittest, __fp16 ZeroDivisionError
787 std::stringstream ss;
788 ss << "[Tensor] divide by value failed, value: " << value;
789 throw std::invalid_argument(ss.str().c_str());
791 return apply(f, out);
794 int Tensor::divide_i(Tensor const &m) {
796 this->divide(m, *this);
797 } catch (std::exception &err) {
798 ml_loge("%s %s", typeid(err).name(), err.what());
799 return ML_ERROR_INVALID_PARAMETER;
802 return ML_ERROR_NONE;
805 Tensor Tensor::divide(Tensor const &m) const {
807 return this->divide(m, t);
810 Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
811 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
812 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
814 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
815 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
816 std::divides<float>());
818 for (unsigned int i = 0; i < e.buffer_size; ++i) {
819 *out_buf = *buf / *m_buf;
821 m_buf += e.strides[3];
822 out_buf += output.strides[3];
827 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
828 std::invalid_argument)
829 << getName() << " is not contiguous, cannot divide";
831 apply_broadcast(m, f, output);
832 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
833 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
835 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
836 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
837 std::divides<__fp16>());
839 for (unsigned int i = 0; i < e.buffer_size; ++i) {
840 *out_buf = *buf / *m_buf;
842 m_buf += e.strides[3];
843 out_buf += output.strides[3];
848 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
849 std::invalid_argument)
850 << getName() << " is not contiguous, cannot divide";
852 apply_broadcast(m, f, output);
857 int Tensor::add_i(float const &value) {
858 this->add(value, *this);
859 return ML_ERROR_NONE;
862 Tensor Tensor::add(float const &value) const {
864 return add(value, t);
867 Tensor &Tensor::add(float const &value, Tensor &out) const {
868 /// @todo add unittest
869 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
870 auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
871 return apply(f, out);
872 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
873 auto f = std::bind(std::plus<__fp16>(), std::placeholders::_1, value);
874 return apply(f, out);
879 int Tensor::add_i(Tensor const &m, float const alpha) {
880 /// @todo: add an axis parameter rather than always adding over the last two
881 /// dimensions; the _i operator has an optimized version
882 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
883 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
885 saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
888 /// @todo: enable this after add_strided supports broadcast
889 // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
890 // << getName() << " is not contiguous, cannot add";
893 apply_broadcast(m, f, *this);
894 } catch (std::exception &err) {
895 ml_loge("%s %s", typeid(err).name(), err.what());
896 return ML_ERROR_INVALID_PARAMETER;
899 return ML_ERROR_NONE;
900 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
901 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
903 saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
904 /// @todo: saxpy is not valid for __fp16
907 /// @todo: enable this after add_strided supports broadcast
908 // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
909 // << getName() << " is not contiguous, cannot add";
912 apply_broadcast(m, f, *this);
913 } catch (std::exception &err) {
914 ml_loge("%s %s", typeid(err).name(), err.what());
915 return ML_ERROR_INVALID_PARAMETER;
918 return ML_ERROR_NONE;
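/**
 * An illustrative sketch with hypothetical names (not part of the original
 * code): add_i(m, alpha) accumulates alpha * m into this tensor via saxpy,
 * e.g. an SGD-style in-place update.
 * @code
 *   Tensor weight(1, 1, 1, 4), grad(1, 1, 1, 4);
 *   float learning_rate = 0.1f;         // hypothetical value
 *   weight.add_i(grad, -learning_rate); // weight += (-learning_rate) * grad
 * @endcode
 */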
922 Tensor Tensor::add(Tensor const &m, float const alpha) const {
924 return this->add(m, t, alpha);
927 Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
928 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
929 std::invalid_argument)
930 << getName() << " is not contiguous, cannot add";
932 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
933 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
935 if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
937 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
940 for (unsigned int i = 0; i < e.buffer_size; ++i) {
941 *out_buf = *buf + *m_buf * alpha;
943 m_buf += e.strides[3];
944 out_buf += strides[3];
948 apply_broadcast(m, f, output);
949 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
950 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
952 if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
954 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
955 std::plus<__fp16>());
957 for (unsigned int i = 0; i < e.buffer_size; ++i) {
958 *out_buf = *buf + *m_buf * alpha;
960 m_buf += e.strides[3];
961 out_buf += strides[3];
965 apply_broadcast(m, f, output);
970 int Tensor::subtract_i(float const &value) {
971 this->subtract(value, *this);
972 return ML_ERROR_NONE;
975 Tensor Tensor::subtract(float const &value) const {
977 return subtract(value, t);
980 Tensor &Tensor::subtract(float const &value, Tensor &out) const {
981 /// @todo add unittest
982 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
983 auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
984 return apply(f, out);
985 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
986 auto f = std::bind(std::minus<__fp16>(), std::placeholders::_1, value);
987 return apply(f, out);
991 int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
993 Tensor Tensor::subtract(Tensor const &m) const { return add(m, -1); }
995 Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const {
996 return add(m, out, -1);
999 int Tensor::pow_i(float exponent) {
1000 pow(exponent, *this);
1001 return ML_ERROR_NONE;
1004 Tensor Tensor::pow(float exponent) const {
1006 return pow(exponent, t);
1009 Tensor &Tensor::pow(float exponent, Tensor &out) const {
1010 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1011 auto f = [exponent](float in) { return powf(in, exponent); };
1012 return apply(f, out);
1014 if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1015 auto f = [exponent](__fp16 in) { return powf(in, exponent); };
1016 return apply(f, out);
1020 Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
1021 TensorDim dim_ = dim;
1024 return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen());
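// For example (hypothetical shape), for a (4, 1, 2, 2) tensor,
// getBatchSlice(1, 1) returns a (1, 1, 2, 2) view aliasing the second sample;
// no data is copied since the slice is built on getSharedDataTensor().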
1027 void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
1030 * - If src already has data allocated, then directly make dest tensor based on src.
1032 * - If src.data does not exist (meaning the tensor does not have memory allocated)
1033 * and src.src_tensor does not exist (meaning the src tensor does not depend
1034 * on another tensor), then create a SrcSharedTensor around src.
1035 * - If src.src_tensor exists, then use src.src_tensor to create the
1036 * required SrcSharedTensor to avoid recursive dependency.
1038 * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
1039 * if the batch size of src is updated and needs reallocation.
1041 dest.data = nullptr;
1043 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1045 } else if (!src.src_tensor)
1046 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1048 dest.src_tensor = std::make_shared<SrcSharedTensor>(
1049 src.src_tensor->tensor(), offset + src.src_tensor->offset());
1052 Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset,
1054 const std::string &name_) const {
1056 if (dim_.getFormat() != ret.dim.getFormat())
1057 throw std::invalid_argument("Tensor format does not match");
1063 if (dim_.getDataLen() + offset > dim.getDataLen())
1064 throw std::invalid_argument(
1065 "Creating shared tensor of size bigger than tensor memory.");
1068 ret.strides = ret.dim.computeStrides();
1070 TensorDim new_match_dim = dim_;
1071 new_match_dim.batch(dim.batch());
1072 if (new_match_dim != dim && !reset_stride)
1073 ret.contiguous = false;
1076 * In this case, it is the caller's responsibility to ensure that allocate() is
1077 * called for the output tensor before operating on the output tensor.
1079 createSharedDataTensor(*this, ret, offset);
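/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): the returned tensor shares memory with this tensor starting at the
 * given element offset.
 * @code
 *   Tensor big(1, 1, 4, 4);
 *   // a (1, 1, 2, 4) view over the last two rows, starting at element 8
 *   Tensor view = big.getSharedDataTensor(TensorDim(1, 1, 2, 4), 8);
 * @endcode
 */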
1084 std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
1085 NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1086 << "num size cannot be zero";
1092 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1093 << "cannot split axis of axis: " << axis;
1095 NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument)
1096 << "axis is not divisible by num_size, axis: " << axis
1097 << " num size: " << num_size;
1099 std::vector<size_t> sizes;
1100 sizes.resize(num_size);
1102 unsigned int sz = dim.getTensorDim(axis) / num_size;
1103 std::fill(sizes.begin(), sizes.end(), sz);
1105 return split(sizes, axis);
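/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code):
 * @code
 *   Tensor t(1, 1, 2, 6);
 *   auto halves = t.split(2, 3);      // two (1, 1, 2, 3) tensors along width
 *   auto parts  = t.split({2, 4}, 3); // a (1, 1, 2, 2) and a (1, 1, 2, 4) tensor
 * @endcode
 */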
1108 std::vector<Tensor> Tensor::split(std::vector<size_t> sizes, int axis) {
1109 size_t num_size = sizes.size();
1111 NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1112 << "num size cannot be zero";
1118 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1119 << "cannot split axis of axis: " << axis;
1122 std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }),
1123 std::invalid_argument)
1124 << "among given sizes at least one of size is 0";
1126 size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0);
1127 NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
1128 << "given sum of sizes did not match with origin tensor dim, tensor dim: "
1129 << dim.getTensorDim(axis) << " total size: " << total_size;
1131 std::vector<TensorDim> ret_dims;
1132 ret_dims.reserve(num_size);
1133 for (unsigned int i = 0; i < num_size; ++i) {
1135 ret_dims[i].setTensorDim(axis, sizes[i]);
1138 bool is_format_nchw = (dim.getFormat() == Tformat::NCHW);
1140 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1141 auto iter_value = [this, is_format_nchw](
1142 std::array<size_t, 4> &loc,
1143 const std::array<size_t, 4> &end_loc,
1144 const std::array<size_t, 4> &reset_dim_arr) -> float & {
1145 auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
1146 : getValue(loc[0], loc[3], loc[1], loc[2]);
1147 for (int i = 3; i >= 0; --i) {
1149 if (loc[i] == end_loc[i]) {
1150 loc[i] -= reset_dim_arr[i];
1158 std::vector<Tensor> ret;
1159 ret.reserve(num_size);
1161 unsigned int accumulated_size = 0;
1162 for (unsigned int i = 0; i < num_size; ++i) {
1163 std::array<size_t, 4> loc = {0, 0, 0, 0};
1165 if (is_format_nchw) {
1166 loc[axis] += accumulated_size;
1169 loc[0] += accumulated_size;
1170 } else if (axis == 1) {
1171 loc[3] += accumulated_size;
1172 } else if (axis == 2 || axis == 3) {
1173 loc[axis - 1] += accumulated_size;
1177 ret.emplace_back(ret_dims[i]);
1178 auto &ret_t = ret.back();
1180 std::array<size_t, 4> end_loc;
1182 if (is_format_nchw) {
1183 end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1184 ret_dims[i].height(), ret_dims[i].width()};
1186 end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1187 ret_dims[i].width(), ret_dims[i].channel()};
1190 accumulated_size += sizes[i];
1192 if (is_format_nchw) {
1193 end_loc[axis] = accumulated_size;
1196 end_loc[0] = accumulated_size;
1197 } else if (axis == 1) {
1198 end_loc[3] = accumulated_size;
1199 } else if (axis == 2 || axis == 3) {
1200 end_loc[axis - 1] = accumulated_size;
1204 std::array<size_t, 4> reset_dim_arr;
1205 if (is_format_nchw) {
1206 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1207 ret_dims[i].height(), ret_dims[i].width()};
1209 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1210 ret_dims[i].width(), ret_dims[i].channel()};
1213 ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1214 return iter_value(loc, end_loc, reset_dim_arr);
1220 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1222 [this, is_format_nchw](
1223 std::array<size_t, 4> &loc, const std::array<size_t, 4> &end_loc,
1224 const std::array<size_t, 4> &reset_dim_arr) -> __fp16 & {
1225 auto &value = (is_format_nchw)
1226 ? getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1227 : getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1228 for (int i = 3; i >= 0; --i) {
1230 if (loc[i] == end_loc[i]) {
1231 loc[i] -= reset_dim_arr[i];
1239 std::vector<Tensor> ret;
1240 ret.reserve(num_size);
1242 unsigned int accumulated_size = 0;
1243 for (unsigned int i = 0; i < num_size; ++i) {
1244 std::array<size_t, 4> loc = {0, 0, 0, 0};
1246 if (is_format_nchw) {
1247 loc[axis] += accumulated_size;
1250 loc[0] += accumulated_size;
1251 } else if (axis == 1) {
1252 loc[3] += accumulated_size;
1253 } else if (axis == 2 || axis == 3) {
1254 loc[axis - 1] += accumulated_size;
1258 ret.emplace_back(ret_dims[i]);
1259 auto &ret_t = ret.back();
1261 std::array<size_t, 4> end_loc;
1263 if (is_format_nchw) {
1264 end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1265 ret_dims[i].height(), ret_dims[i].width()};
1267 end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1268 ret_dims[i].width(), ret_dims[i].channel()};
1271 accumulated_size += sizes[i];
1273 if (is_format_nchw) {
1274 end_loc[axis] = accumulated_size;
1277 end_loc[0] = accumulated_size;
1278 } else if (axis == 1) {
1279 end_loc[3] = accumulated_size;
1280 } else if (axis == 2 || axis == 3) {
1281 end_loc[axis - 1] = accumulated_size;
1285 std::array<size_t, 4> reset_dim_arr;
1286 if (is_format_nchw) {
1287 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1288 ret_dims[i].height(), ret_dims[i].width()};
1290 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1291 ret_dims[i].width(), ret_dims[i].channel()};
1294 ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1295 return iter_value(loc, end_loc, reset_dim_arr);
1303 Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
1309 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1310 << "cannot split axis of axis: " << axis;
1312 NNTR_THROW_IF(tensors.empty(), std::invalid_argument)
1313 << "given tensor vector is empty";
1315 auto ref_dim = tensors.front().getDim();
1316 bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW);
1317 ref_dim.setTensorDim(axis, 1);
1318 NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
1319 [&ref_dim, axis](const Tensor &t) {
1320 auto cur_dim = t.getDim();
1321 cur_dim.setTensorDim(axis, 1);
1322 return ref_dim == cur_dim;
1324 std::invalid_argument)
1325 << " all tensor must have the same dimension except for the axis, ref_dim: "
1326 << ref_dim << " axis : " << axis;
1328 auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
1329 [axis](unsigned cur, const Tensor &t) {
1330 return cur += t.getDim().getTensorDim(axis);
1332 if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1334 [is_format_nchw](std::array<unsigned, 4> &loc,
1335 const std::array<unsigned, 4> &start_loc, Tensor &t,
1336 const std::array<unsigned, 4> &ref_dim_arr) -> float & {
1337 auto &value = is_format_nchw
1338 ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
1339 : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
1341 for (int i = 3; i >= 0; --i) {
1343 if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1344 loc[i] = start_loc[i];
1352 auto ret_dim = ref_dim;
1353 ret_dim.setTensorDim(axis, axis_dim);
1355 auto ret = Tensor(ret_dim);
1357 std::array<unsigned, 4> loc = {0, 0, 0, 0};
1358 for (auto &t : tensors) {
1359 std::array<unsigned, 4> start_loc = loc;
1360 std::array<unsigned, 4> tensor_dim_arr;
1361 if (is_format_nchw) {
1362 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1363 tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1364 tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1365 tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1367 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1368 tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1369 tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1370 tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1373 for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1374 iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<float>(i);
1377 if (is_format_nchw) {
1378 loc[axis] += t.getDim().getTensorDim(axis);
1381 loc[0] += t.getDim().getTensorDim(axis);
1382 } else if (axis == 1) {
1383 loc[3] += t.getDim().getTensorDim(axis);
1384 } else if (axis == 2 || axis == 3) {
1385 loc[axis - 1] += t.getDim().getTensorDim(axis);
1391 } else if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1393 [is_format_nchw](std::array<unsigned, 4> &loc,
1394 const std::array<unsigned, 4> &start_loc, Tensor &t,
1395 const std::array<unsigned, 4> &ref_dim_arr) -> __fp16 & {
1396 auto &value = is_format_nchw
1397 ? t.getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1398 : t.getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1400 for (int i = 3; i >= 0; --i) {
1402 if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1403 loc[i] = start_loc[i];
1411 auto ret_dim = ref_dim;
1412 ret_dim.setTensorDim(axis, axis_dim);
1414 auto ret = Tensor(ret_dim);
1416 std::array<unsigned, 4> loc = {0, 0, 0, 0};
1417 for (auto &t : tensors) {
1418 std::array<unsigned, 4> start_loc = loc;
1419 std::array<unsigned, 4> tensor_dim_arr;
1420 if (is_format_nchw) {
1421 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1422 tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1423 tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1424 tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1426 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1427 tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1428 tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1429 tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1432 for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1433 iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<__fp16>(i);
1436 if (is_format_nchw) {
1437 loc[axis] += t.getDim().getTensorDim(axis);
1440 loc[0] += t.getDim().getTensorDim(axis);
1441 } else if (axis == 1) {
1442 loc[3] += t.getDim().getTensorDim(axis);
1443 } else if (axis == 2 || axis == 3) {
1444 loc[axis - 1] += t.getDim().getTensorDim(axis);
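/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): all dimensions except the concatenation axis must match.
 * @code
 *   Tensor a(1, 2, 2, 3), b(1, 2, 2, 5);
 *   Tensor c = Tensor::cat({a, b}, 3); // c has dimension (1, 2, 2, 8)
 * @endcode
 */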
1453 void Tensor::makeSharedDataTensor(const Tensor &src, size_t offset) {
1454 if (strides != src.strides)
1455 throw std::invalid_argument(
1456 "Creating shared tensor of different stride than source tensor.");
1458 if (getDim().getDataLen() + offset > src.getDim().getDataLen())
1459 throw std::invalid_argument(
1460 "Creating shared tensor of different size or stride than source tensor.");
1463 * In this case, it is the caller's responsibility to ensure that allocate() is
1464 * called for the output tensor before operating on the output tensor.
1466 createSharedDataTensor(src, *this, offset);
1469 void Tensor::apply_broadcast(
1471 std::function<void(const BroadcastInfo &e, const float *, const float *,
1474 Tensor &output) const {
1475 CREATE_IF_EMPTY_DIMS(output, dim);
1477 NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
1478 << getName() << " is not allocated";
1479 NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument)
1480 << m.getName() << " is not allocated";
1481 NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument)
1482 << output.getName() << " is not allocated";
1484 /// shortcut to cover when dimension matches
1485 /// note that only buffer_size and the last stride are used in v_func, but
1486 /// this might change
1489 e.buffer_size = size();
1491 e.tensor_type = getTensorType();
1492 v_func(e, getData(), m.getData(), output.getData());
1496 return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1499 void Tensor::apply_broadcast(
1501 std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1504 Tensor &output) const {
1505 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
1507 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
1508 << getName() << " is not allocated";
1509 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
1510 << m.getName() << " is not allocated";
1511 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
1512 << output.getName() << " is not allocated";
1514 /// shortcut to cover when dimension matches
1515 /// note that only buffer_size and the last stride are used in v_func, but
1516 /// this might change
1519 e.buffer_size = size();
1521 v_func(e, getData<__fp16>(), m.getData<__fp16>(), output.getData<__fp16>());
1525 return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1528 void Tensor::apply_broadcast_util(
1530 std::function<void(const BroadcastInfo &e, const float *, const float *,
1533 Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1534 size_t m_offset) const {
1536 const float *buf = this->getData();
1537 const float *m_buf = m.getData();
1538 float *out_buf = output.getData();
1540 if (e.buffer_axis == cur_axis) {
1541 v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1546 uint continuity[4] = {0, 1, 2, 3};
1547 if (getFormat() == Tformat::NHWC) {
1552 for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
1553 size_t next_offset = offset + i * strides[cur_axis];
1554 size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1555 apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1560 void Tensor::apply_broadcast_util(
1562 std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1565 Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1566 size_t m_offset) const {
1568 const __fp16 *buf = this->getData<__fp16>();
1569 const __fp16 *m_buf = m.getData<__fp16>();
1570 __fp16 *out_buf = output.getData<__fp16>();
1572 if (e.buffer_axis == cur_axis) {
1573 v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1578 for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) {
1579 size_t next_offset = offset + i * strides[cur_axis];
1580 size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1581 apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1587 * This sums the Tensor data along the batch dimension, dim.batch().
1588 * Therefore the result has dimension (dim.batch(), 1, 1, 1).
1590 Tensor Tensor::sum_by_batch() const {
1591 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1592 << getName() << " is not contiguous, cannot sum";
1594 Tensor ret(dim.batch(), 1, 1, 1, this->getFormat(), getDataType());
1595 size_t feat_len = dim.getFeatureLen();
1596 size_t batch = dim.batch();
1598 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1599 const float *data = getData();
1600 float *rdata = ret.getData();
1602 Tensor ones(1, 1, 1, feat_len, this->getFormat());
1604 sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1605 ones.getData<float>(), 1, 0.0, rdata, 1);
1606 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1607 const __fp16 *data = getData<__fp16>();
1608 __fp16 *rdata = ret.getData<__fp16>();
1610 Tensor ones(1, 1, 1, feat_len, this->getTensorType());
1611 ones.setValue((__fp16)1.0);
1612 sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1613 ones.getData<__fp16>(), 1, 0.0, rdata, 1);
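// For example (hypothetical shape), summing a (2, 1, 2, 2) tensor filled with
// ones by batch yields a (2, 1, 1, 1) tensor whose entries are all 4.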
1620 * @brief Calculate sum according to the axis.
1622 Tensor Tensor::sum(unsigned int axis, float alpha) const {
1623 Tensor ret("", this->getFormat(), this->getDataType());
1624 return sum(axis, ret, alpha, 0);
1627 Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
1630 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1631 const float *data = getData<float>();
1633 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1634 << getName() << " is not contiguous, cannot sum";
1637 throw std::out_of_range("Error: axis is invalid");
1639 if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1640 CREATE_IF_EMPTY_DIMS(ret, dim);
1641 ret.copy(this->getData());
1647 CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1648 this->getTensorType());
1649 size_t feat_len = dim.getFeatureLen();
1650 size_t batch = dim.batch();
1651 Tensor ones(1, 1, 1, batch, this->getFormat());
1652 ones.setValue(alpha);
1653 sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1654 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1657 CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1658 if (this->getFormat() == Tformat::NHWC) {
1659 unsigned int m = ret.dim.getDataLen();
1660 unsigned int n = dim[1];
1661 Tensor ones(1, 1, 1, n, this->getTensorType());
1662 ones.setValue(alpha);
1663 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1664 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1666 unsigned int feat_len = dim[2] * dim[3];
1667 unsigned int t_axis = dim[1];
1668 Tensor ones(1, 1, 1, t_axis, getTensorType());
1669 ones.setValue(alpha);
1670 float *rdata = ret.getData<float>();
1671 for (unsigned int k = 0; k < dim[0]; ++k) {
1672 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1673 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1674 1, beta, &rdata[k * feat_len], 1);
1679 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1681 if (this->getFormat() == Tformat::NHWC) {
1682 unsigned int feat_len = dim[1] * dim[3];
1683 unsigned int t_axis = dim[2];
1684 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1685 ones.setValue(alpha);
1686 float *rdata = ret.getData<float>();
1687 for (unsigned int k = 0; k < dim[0]; ++k) {
1688 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1689 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1690 1, beta, &rdata[k * feat_len], 1);
1693 unsigned int t_3 = dim[3];
1694 unsigned int t_axis = dim[2];
1695 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1696 ones.setValue(alpha);
1697 float *rdata = ret.getData<float>();
1698 for (unsigned int k = 0; k < dim[0]; ++k) {
1699 for (unsigned int c = 0; c < dim[1]; ++c) {
1700 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1701 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1702 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1703 ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1709 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1,
1710 this->getTensorType());
1711 if (this->getFormat() == Tformat::NHWC) {
1712 unsigned int t_3 = dim[1];
1713 unsigned int t_axis = dim[3];
1714 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1715 ones.setValue(alpha);
1716 float *rdata = ret.getData<float>();
1717 for (unsigned int k = 0; k < dim[0]; ++k) {
1718 for (unsigned int c = 0; c < dim[2]; ++c) {
1719 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1720 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1721 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1722 ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1726 unsigned int m = ret.dim.getDataLen();
1727 unsigned int n = dim[3];
1728 Tensor ones(1, 1, 1, n);
1729 ones.setValue(alpha);
1730 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1731 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1735 throw std::out_of_range("Error: axis cannot exceed 3");
1738 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1739 const __fp16 *data = getData<__fp16>();
1741 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1742 << getName() << " is not contiguous, cannot sum";
1745 throw std::out_of_range("Error: axis is invalid");
1747 if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1748 CREATE_IF_EMPTY_DIMS(ret, dim);
1749 ret.copy(this->getData<__fp16>());
1755 CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1756 this->getTensorType());
1757 size_t feat_len = dim.getFeatureLen();
1758 size_t batch = dim.batch();
1759 Tensor ones(1, 1, 1, batch, this->getTensorType());
1760 ones.setValue(alpha);
1761 sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1762 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1765 CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1766 if (this->getFormat() == Tformat::NHWC) {
1767 unsigned int m = ret.dim.getDataLen();
1768 unsigned int n = dim[1];
1769 Tensor ones(1, 1, 1, n, this->getTensorType());
1770 ones.setValue(alpha);
1771 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1772 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1774 unsigned int feat_len = dim[2] * dim[3];
1775 unsigned int t_axis = dim[1];
1776 Tensor ones(1, 1, 1, t_axis, getTensorType());
1777 ones.setValue(alpha);
1778 __fp16 *rdata = ret.getData<__fp16>();
1779 for (unsigned int k = 0; k < dim[0]; ++k) {
1780 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1781 &data[k * dim.getFeatureLen()], feat_len,
1782 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1787 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1789 if (this->getFormat() == Tformat::NHWC) {
1790 unsigned int feat_len = dim[1] * dim[3];
1791 unsigned int t_axis = dim[2];
1792 Tensor ones(1, 1, 1, t_axis, getTensorType());
1793 ones.setValue(alpha);
1794 __fp16 *rdata = ret.getData<__fp16>();
1795 for (unsigned int k = 0; k < dim[0]; ++k) {
1796 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1797 &data[k * dim.getFeatureLen()], feat_len,
1798 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1801 unsigned int t_3 = dim[3];
1802 unsigned int t_axis = dim[2];
1803 Tensor ones(1, 1, 1, t_axis, getTensorType());
1804 ones.setValue(alpha);
1805 __fp16 *rdata = ret.getData<__fp16>();
1806 for (unsigned int k = 0; k < dim[0]; ++k) {
1807 for (unsigned int c = 0; c < dim[1]; ++c) {
1808 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1809 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1810 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1811 ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1817 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, getTensorType());
1818 if (this->getFormat() == Tformat::NHWC) {
1819 unsigned int t_3 = dim[1];
1820 unsigned int t_axis = dim[3];
1821 Tensor ones(1, 1, 1, t_axis, getTensorType());
1822 ones.setValue(alpha);
1823 __fp16 *rdata = ret.getData<__fp16>();
1824 for (unsigned int k = 0; k < dim[0]; ++k) {
1825 for (unsigned int c = 0; c < dim[2]; ++c) {
1826 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1827 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1828 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1829 ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1833 unsigned int m = ret.dim.getDataLen();
1834 unsigned int n = dim[3];
1835 Tensor ones(1, 1, 1, n, getTensorType());
1836 ones.setValue(alpha);
1837 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1838 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1842 throw std::out_of_range("Error: axis cannot exceed 3");
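/**
 * An illustrative sketch with hypothetical shapes (not part of the original
 * code): the summed axis collapses to 1 and alpha scales every contribution.
 * @code
 *   Tensor t(2, 3, 4, 5);
 *   Tensor s1 = t.sum(1);       // shape (2, 1, 4, 5), summed over channel
 *   Tensor s3 = t.sum(3, 0.5f); // shape (2, 3, 4, 1), each sum scaled by 0.5
 * @endcode
 */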
1848 Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
1849 Tensor ret("", this->getFormat());
1850 return sum(axes, ret, alpha);
1853 void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
1854 std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1855 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1856 << getName() << " is not contiguous, cannot merge axis";
1858 if (axis2 != axis1 + 1)
1859 if (!checkContinuous(axis1, axis2))
1860 throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
1862 dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2));
1863 dim.setTensorDim(axis1, 1);
1866 Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
1867 float alpha) const {
1869 throw std::invalid_argument("empty axes given");
1871 if (axes.size() == 1) {
1872 this->sum(axes[0], output, alpha);
1874 /** club axes together */
1875 Tensor new_reshaped = *this;
1876 std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1877 std::vector<unsigned int> new_axes = {axes[0]};
1879 for (unsigned int i = 1; i < axes.size(); ++i) {
1880 if (checkContinuous(axes[i - 1], axes[i])) {
1881 new_reshaped.mergeAxis(axes[i - 1], axes[i]);
1882 new_axes.back() = axes[i];
1884 new_axes.push_back(axes[i]);
1888 Tensor ret = new_reshaped.sum(new_axes[0]);
1889 for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
1890 ret = ret.sum(new_axes[i]);
1891 ret.sum(new_axes.back(), output, alpha);
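// For example (hypothetical shape), t.sum({2, 3}) on a (2, 3, 4, 5) tensor
// merges the two continuous axes first and returns a (2, 3, 1, 1) result.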
1897 Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
1898 bool trans_m, float beta) const {
1899 if (!result.isAllocated())
1900 throw std::invalid_argument(
1901 "Output tensor must be preallocated for dotBatched operation");
1902 for (unsigned int b = 0; b < batch(); b++) {
1903 /** @todo try using transpose to speedup the operation */
1904 const Tensor this_b = this->getBatchSlice(b, 1);
1905 Tensor m_b = m.getBatchSlice(b, 1);
1906 Tensor result_b = result.getBatchSlice(b, 1);
1908 this_b.dot(m_b, result_b, trans, trans_m, beta);
1914 Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const {
1915 Tensor output("", this->getFormat(), this->getDataType());
1916 dot(m, output, trans, trans_m);
1921 * @brief compute the derivative of this in the current tensor
1922 * @todo will have to see if beta affects this computation
1924 Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
1925 bool trans, bool trans_m, float beta) {
1926 bool deriv_trans_m = true;
1927 bool deriv_trans = false;
1928 /** @todo handle all cases of trans and trans_m */
1929 if (!trans && trans_m) {
1930 deriv_trans_m = false;
1933 return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
1937 * @brief compute the derivative wrt m in the m tensor
* @note The caller tensor must be the same tensor as the one which called
* the dot() operation.
1941 Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
1942 bool trans, bool trans_m, float beta) const {
1943 bool deriv_trans_m = false;
1944 bool deriv_trans = true;
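// For C = A.dot(B): dL/dB = A^T . dL/dC, so *this (the forward input) is
// transposed by default here; when B was transposed in the forward pass,
// dL/dB = dL/dC^T . A is computed instead.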
1945 /** @todo handle all cases of trans and trans_m */
1947 if (!trans && trans_m) {
1948 output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1951 return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
1955 Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
1956 Tensor const &output_deriv, bool trans,
1957 bool trans_m, float beta) {
1958 bool deriv_trans_m = true;
1959 bool deriv_trans = false;
1960 /** @todo handle all cases of trans and trans_m */
1961 if (!trans && trans_m) {
1962 deriv_trans_m = false;
1965 return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
1968 Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
1969 Tensor const &output_deriv, bool trans,
1970 bool trans_m, float beta) const {
1971 bool deriv_trans_m = false;
1972 bool deriv_trans = true;
1973 /** @todo handle all cases of trans and trans_m */
1975 if (!trans && trans_m) {
1976 output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1979 return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
* @note This dot product flattens the first 3 axes for the purpose of
* computation. So, while the operation is performed, these tensors behave as
* 2-D matrices. The dimensions are restored when the tensor is returned, in
* case trans is false.
1989 Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m,
1991 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1992 << getName() << " is not contiguous. Cannot dot product.";
// Commented out with the intention to support the calculation w.r.t. the
// batch and height directions. It assumes this->dim is [ BxCxH, W ] and
// m.dim is [ BxCxH, W ] as well.
// if (m.dim.rank() > 2) {
1997 // throw exception::not_supported("Error: support only for rank of dot "
// Commented out with the intention to support the calculation w.r.t. the
// batch and height directions of this tensor. It is OK as long as m is 2D.
2004 if (trans && dim.rank() > 2) {
2005 ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
2007 unsigned int dim1, dim2, mdim1, mdim2;
2008 if (getFormat() == Tformat::NHWC) {
2009 dim1 = batch() * height() * width();
2011 mdim1 = m.batch() * m.height() * m.width();
2012 mdim2 = m.channel();
2014 dim1 = batch() * channel() * height();
2016 mdim1 = m.batch() * m.channel() * m.height();
2020 unsigned int M, N, K, lda, ldb, ldc;
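// The four (trans, trans_m) combinations below differ in how M, N and K are
// derived from the flattened dimensions and in the shape passed to
// CREATE_IF_EMPTY_DIMS for the result tensor.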
2022 if (!trans && !trans_m) {
2024 throw std::runtime_error(
2025 "Error: incompatible dimensions for dot product");
2026 K = mdim1; /** == dim2 */
2029 if (getFormat() == Tformat::NHWC) {
2030 CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2031 getTensorType()); // NHWC Result Tensor
2033 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
// We do not zero-initialize the result for performance reasons.
// However, the result is then not properly initialized and may contain
// garbage such as NaN. When this value is used as in C = alpha*A*B + beta*C,
// it must be verified that the garbage data in C does not affect the outcome.
2042 } else if (!trans && trans_m) {
2044 throw std::runtime_error(
2045 "Error: incompatible dimensions for dot product");
2046 K = mdim2; /** == dim2 */
2049 if (getFormat() == Tformat::NHWC) {
2050 CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2053 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2060 } else if (trans && !trans_m) {
2062 throw std::runtime_error(
2063 "Error: incompatible dimensions for dot product");
2064 K = mdim1; /** == dim1 */
2067 if (getFormat() == Tformat::NHWC) {
2068 CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2070 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2074 throw std::runtime_error(
2075 "Error: incompatible dimensions for dot product");
2076 K = mdim2; /** == dim1 */
2079 if (getFormat() == Tformat::NHWC) {
2080 CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2082 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2087 ldc = (getFormat() == Tformat::NHWC) ? result.channel() : result.width();
2089 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2090 const float *data = getData();
2091 const float *mdata = m.getData();
2092 float *rdata = result.getData();
2093 const float alpha = 1.0f;
2094 enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2095 enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2097 /// shortcut handling in case of vector
2098 /// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
2101 /// For example, there is no case like (1 * K) X (1 * K) while
2102 /// (1 * K) X (1 * M) can be a case
2103 /// case1: (1 * K) X (K * 1)
2104 if (M == 1 && N == 1) {
2105 *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2107 /// case2: (M * K) X (K * 1)
2109 sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2112 /// case3: (1 * K) X (K * N) = 1 * N = R
2113 /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2114 /// Effectively a translation of sgemv
2116 transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2117 sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2120 /// case others: use gemm
2122 sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2123 ldb, beta, rdata, ldc);
2125 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2126 const __fp16 *data = getData<__fp16>();
2127 const __fp16 *mdata = m.getData<__fp16>();
2128 __fp16 *rdata = result.getData<__fp16>();
2129 const float alpha = 1.0f;
2130 enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2131 enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2133 /// shortcut handling in case of vector
2134 /// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
2137 /// For example, there is no case like (1 * K) X (1 * K) while
2138 /// (1 * K) X (1 * M) can be a case
2139 /// case1: (1 * K) X (K * 1)
2140 if (M == 1 && N == 1) {
2141 *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2143 /// case2: (M * K) X (K * 1)
2145 sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2148 /// case3: (1 * K) X (K * N) = 1 * N = R
2149 /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2150 /// Effectively a translation of sgemv
2152 transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2153 sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2156 /// case others: use sgemm
2158 sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2159 ldb, beta, rdata, ldc);
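/**
 * @brief Transpose according to a direction string of the form "i:j:k"
 * (e.g. "1:0:2") giving the new order of the channel, height and width axes;
 * the batch axis stays in place.
 *
 * A minimal usage sketch (hypothetical shapes):
 * @code
 * Tensor t(1, 2, 3, 4);            // NCHW: 1 x 2 x 3 x 4
 * Tensor r = t.transpose("1:0:2"); // swaps channel and height -> 1 x 3 x 2 x 4
 * @endcode
 */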
2166 Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
2167 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2168 << getName() << " is not contiguous. Cannot transpose.";
2170 if (out.getData() == getData()) {
2171 Tensor tmp = clone();
2172 return tmp.transpose(direction, out);
2175 unsigned int SL, SI, SJ, SK;
2177 out.reshape(dim.transpose(direction));
2179 int indexI = direction[0] - '0';
2180 int indexJ = direction[2] - '0';
2182 SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
2184 bool is_format_nchw = (getFormat() == Tformat::NCHW);
2186 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2187 const float *inptr = getData();
2188 float *outptr = out.getData();
2192 if (is_format_nchw) {
2193 transposeloop(l, i, j, k, SL, SI, SJ, SK);
2195 transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2198 if (is_format_nchw) {
2199 transposeloop(l, i, k, j, SL, SI, SK, SJ);
2201 transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2207 if (is_format_nchw) {
2208 transposeloop(l, j, i, k, SL, SJ, SI, SK);
2210 transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2213 if (is_format_nchw) {
2214 transposeloop(l, j, k, i, SL, SJ, SK, SI);
2216 transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2222 if (is_format_nchw) {
2223 transposeloop(l, k, i, j, SL, SK, SI, SJ);
2225 transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2228 if (is_format_nchw) {
2229 transposeloop(l, k, j, i, SL, SK, SJ, SI);
2231 transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2237 const __fp16 *inptr = getData<__fp16>();
2238 __fp16 *outptr = out.getData<__fp16>();
2242 if (is_format_nchw) {
2243 transposeloop(l, i, j, k, SL, SI, SJ, SK);
2245 transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2248 if (is_format_nchw) {
2249 transposeloop(l, i, k, j, SL, SI, SK, SJ);
2251 transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2257 if (is_format_nchw) {
2258 transposeloop(l, j, i, k, SL, SJ, SI, SK);
2260 transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2263 if (is_format_nchw) {
2264 transposeloop(l, j, k, i, SL, SJ, SK, SI);
2266 transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2272 if (is_format_nchw) {
2273 transposeloop(l, k, i, j, SL, SK, SI, SJ);
2275 transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2278 if (is_format_nchw) {
2279 transposeloop(l, k, j, i, SL, SK, SJ, SI);
2281 transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2291 Tensor Tensor::transpose(const std::string &direction) const {
2293 transpose(direction, result);
2297 Tensor Tensor::dropout_mask(float dropout) const {
2299 result.dropout_mask(dropout);
2303 void Tensor::dropout_mask(float dropout) {
2304 setRandUniform(0.0, 1.0);
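// Inverted dropout: entries that survive the mask are scaled by
// 1 / (1 - dropout) so the expected magnitude of the activations is
// preserved at training time.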
2305 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2306 float scale = 1.0 / (1 - dropout);
2307 float *data_ = getData();
2308 for (unsigned int i = 0; i < size(); ++i) {
2309 if (data_[i] >= dropout)
2314 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2315 __fp16 scale = 1.0 / (1 - dropout);
2316 __fp16 *data_ = getData<__fp16>();
2317 for (unsigned int i = 0; i < size(); ++i) {
2318 if (data_[i] >= dropout)
2326 void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
2327 float fill_mask_val = 0.0;
2328 float en_mask_val = 1.0 - fill_mask_val;
2331 fill_mask_val = 1.0;
2332 en_mask_val = 1.0 - fill_mask_val;
2335 setValue(fill_mask_val);
2336 if (mask_len.batch() != batch())
2337 throw std::invalid_argument("Number of filter masks mismatched");
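// For each batch b, the first mask_len[b] elements are overwritten with
// en_mask_val while the remaining elements keep fill_mask_val.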
2338 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2339 for (unsigned int b = 0; b < batch(); b++) {
2340 float *addr = getAddress(b, 0, 0, 0);
2341 const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2342 std::fill(addr, addr + (*mask_len_val), en_mask_val);
2344 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2345 for (unsigned int b = 0; b < batch(); b++) {
2346 __fp16 *addr = getAddress<__fp16>(b, 0, 0, 0);
2347 const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2348 std::fill(addr, addr + (*mask_len_val), (__fp16)en_mask_val);
2353 Tensor Tensor::zoneout_mask(float zoneout) {
2354 Tensor ret(getDim());
2355 zoneout_mask(ret, zoneout);
2359 void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
2360 if (dim != opposite.dim) {
2361 throw std::invalid_argument(
2362 "[Tensor::zoneout_mask] opposite dimension does not match");
2365 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2366 opposite.setRandBernoulli(zoneout);
2368 float *data = getData();
2369 float *opposite_data = opposite.getData();
2371 for (unsigned int i = 0; i < size(); ++i) {
2372 if (opposite_data[i] > epsilon) {
2378 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2379 __fp16 zoneout_fp16 = (__fp16)zoneout;
2380 opposite.setRandBernoulli(zoneout_fp16);
2382 __fp16 *data = getData<__fp16>();
2383 __fp16 *opposite_data = opposite.getData<__fp16>();
2385 for (unsigned int i = 0; i < size(); ++i) {
2386 if (opposite_data[i] > epsilon) {
2387 data[i] = (__fp16)0.0;
2389 data[i] = (__fp16)1.0;
2395 // int Tensor::apply_i(std::function<float(float)> f) {
2396 // Tensor result = *this;
2397 // apply(f, result);
2399 // return ML_ERROR_NONE;
2402 // Tensor Tensor::apply(std::function<float(float)> f) const {
2404 // return apply(f, result);
2407 // Tensor &Tensor::apply(std::function<float(float)> f, Tensor &output) const {
2408 // CREATE_IF_EMPTY_DIMS(output, dim);
2410 // if (dim != output.dim) {
2411 // /// @todo add unittest
2412 // throw std::invalid_argument(
2413 // "[Tensor::apply] output dimension does not match");
2416 // if (contiguous && output.contiguous) {
2417 // const float *data = getData();
2418 // float *rdata = output.getData();
2419 // std::transform(data, data + size(), rdata, f);
2420 // } else if (strides[3] == 1 && output.strides[3] == 1) {
2421 // /** @todo optimize this with combining these loops where stride is 1 */
2422 // for (unsigned int b = 0; b < batch(); ++b) {
2423 // for (unsigned int c = 0; c < channel(); ++c) {
2424 // for (unsigned int h = 0; h < height(); ++h) {
2425 // float *out_data = output.getAddress(b, c, h, 0);
2426 // const float *in_data = getAddress(b, c, h, 0);
2427 // std::transform(in_data, in_data + width(), out_data, f);
2432 // for (unsigned int b = 0; b < batch(); ++b) {
2433 // for (unsigned int c = 0; c < channel(); ++c) {
2434 // for (unsigned int h = 0; h < height(); ++h) {
2435 // for (unsigned int w = 0; w < width(); ++w) {
2436 // output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
2446 Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
2448 Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
2449 Tensor &output) const {
2450 return f(*this, output);
2453 void Tensor::print(std::ostream &out) const {
2454 printInstance(out, this);
2455 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2456 const float *data = getData<float>();
2457 unsigned int len = size();
2458 out << "data addr: " << data << '\n';
2462 out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2463 << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2464 << ']' << std::endl;
2468 std::ios init(NULL);
2470 if (getFormat() == Tformat::NCHW) {
2471 for (unsigned int k = 0; k < batch(); k++) {
2472 for (unsigned int l = 0; l < channel(); l++) {
2473 for (unsigned int i = 0; i < height(); i++) {
2474 for (unsigned int j = 0; j < width(); j++) {
2475 out << std::setw(10) << std::setprecision(10)
2476 << this->getValue<float>(k, l, i, j) << " ";
2482 out << "-------" << std::endl;
2485 for (unsigned int k = 0; k < batch(); k++) {
2486 for (unsigned int i = 0; i < height(); i++) {
2487 for (unsigned int j = 0; j < width(); j++) {
2488 for (unsigned int l = 0; l < channel(); l++) {
2489 out << std::setw(10) << std::setprecision(10)
2490 << this->getValue<float>(k, l, i, j) << " ";
2496 out << "-------" << std::endl;
2500 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2501 const __fp16 *data = getData<__fp16>();
2502 unsigned int len = size();
2503 out << "data addr: " << data << '\n';
2507 out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2508 << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2509 << ']' << std::endl;
2513 std::ios init(NULL);
2515 if (getFormat() == Tformat::NCHW) {
2516 for (unsigned int k = 0; k < batch(); k++) {
2517 for (unsigned int l = 0; l < channel(); l++) {
2518 for (unsigned int i = 0; i < height(); i++) {
2519 for (unsigned int j = 0; j < width(); j++) {
2520 out << std::setw(10) << std::setprecision(10)
2521 << this->getValue<__fp16>(k, l, i, j) << " ";
2527 out << "-------" << std::endl;
2530 for (unsigned int k = 0; k < batch(); k++) {
2531 for (unsigned int i = 0; i < height(); i++) {
2532 for (unsigned int j = 0; j < width(); j++) {
2533 for (unsigned int l = 0; l < channel(); l++) {
2534 out << std::setw(10) << std::setprecision(10)
2535 << this->getValue<__fp16>(k, l, i, j) << " ";
2541 out << "-------" << std::endl;
2548 void Tensor::print_(std::ostream &out, uint opt) const {
2549 printInstance(out, this);
2550 const float *data = getData();
2552 unsigned int len = size();
2554 std::ios init(NULL);
2557 if (getFormat() == Tformat::NCHW) {
2559 for (unsigned int k = 0; k < batch(); k++) {
2561 for (unsigned int i = 0; i < channel(); i++) {
2563 for (unsigned int j = 0; j < height(); j++) {
2565 for (unsigned int l = 0; l < width(); l++) {
if (l < width() - 1)
out << std::setw(10) << std::setprecision(10)
<< this->getValue<float>(k, i, j, l) << ", ";
out << std::setw(10) << std::setprecision(10)
<< this->getValue<float>(k, i, j, l);
2573 if (j < height() - 1)
2579 if (i < channel() - 1)
2585 if (k < batch() - 1)
2594 for (unsigned int k = 0; k < batch(); k++) {
2596 for (unsigned int i = 0; i < height(); i++) {
2598 for (unsigned int j = 0; j < width(); j++) {
2600 for (unsigned int l = 0; l < channel(); l++) {
2601 if (l < channel() - 1)
2602 out << std::setw(10) << std::setprecision(10)
2603 << this->getValue<float>(k, l, i, j) << ", ";
2605 out << std::setw(10) << std::setprecision(10)
2606 << this->getValue<float>(k, l, i, j);
2608 if (j < width() - 1)
2614 if (i < height() - 1)
2620 if (k < batch() - 1)
2629 for (uint i = 0; i < len; ++i) {
2630 out << getData<float>()[i] << ", ";
2635 std::ostream &operator<<(std::ostream &out, Tensor const &m) {
2640 void Tensor::copy(const void *buf) {
2641 NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot copy.";
2644 if (getDataType() == ml::train::TensorDim::DataType::FP16 &&
2645 buf == getData<__fp16>()) {
2647 } else if (getDataType() == ml::train::TensorDim::DataType::FP32 &&
2651 // std::string type_ =
2652 // (getDataType() == ml::train::TensorDim::DataType::FP16) ? "FP16" : "NO";
2653 // std::cout << type_ << std::endl;
2655 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2656 scopy(size(), (__fp16 *)buf, 1, getData<__fp16>(), 1);
2657 } else if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2658 scopy(size(), (float *)buf, 1, getData<float>(), 1);
2662 void Tensor::copy_with_stride(const Tensor &from) {
2664 if (dim == from.getDim()) {
2665 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2666 for (unsigned int b = 0; b < batch(); ++b) {
2667 for (unsigned int c = 0; c < channel(); ++c) {
2668 for (unsigned int h = 0; h < height(); ++h) {
2669 for (unsigned int w = 0; w < width(); ++w) {
2670 setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
2676 for (unsigned int b = 0; b < batch(); ++b) {
2677 for (unsigned int c = 0; c < channel(); ++c) {
2678 for (unsigned int h = 0; h < height(); ++h) {
2679 for (unsigned int w = 0; w < width(); ++w) {
2680 setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2687 Tensor t = Tensor(from.getDim(), true);
2688 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2689 for (unsigned int b = 0; b < t.batch(); ++b) {
2690 for (unsigned int c = 0; c < t.channel(); ++c) {
2691 for (unsigned int h = 0; h < t.height(); ++h) {
2692 for (unsigned int w = 0; w < t.width(); ++w) {
2693 t.setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
for (unsigned int b = 0; b < t.batch(); ++b) {
for (unsigned int c = 0; c < t.channel(); ++c) {
for (unsigned int h = 0; h < t.height(); ++h) {
for (unsigned int w = 0; w < t.width(); ++w) {
t.setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2713 void Tensor::copy(const Tensor &from) {
2714 // todo: enable copy to non-contiguous tensor
2716 throw std::runtime_error("Cannot copy non-contiguous tensor");
2719 if (from.size() != 0 && size() == from.size() &&
2720 getDataType() == from.getDataType()) {
2721 reshape(from.getDim());
2722 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2723 copy(from.getData());
2724 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2725 copy(from.getData<__fp16>());
2729 Tensor t = Tensor(from.getDim(), from.getData());
2734 void Tensor::copyData(const Tensor &from) {
2735 // todo: enable copy to non-contiguous tensor
2737 throw std::runtime_error("Cannot copy non-contiguous tensor");
2740 if (size() != from.size())
2741 throw std::invalid_argument("Size of tensor to copy must match");
2743 if (getDataType() != from.getDataType())
2744 throw std::invalid_argument("Data type of tensor to copy must match");
2746 copy(from.getData());
2749 Tensor Tensor::clone() const {
2756 void Tensor::reshape(const TensorDim &d) {
2758 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2759 << getName() << " is not contiguous, cannot reshape.";
2761 NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument)
2762 << "[Tensor]: reshape cannot change the buffer size, trying reshaping "
2764 << getDim() << " to " << d;
2767 strides = d.computeStrides();
2770 void Tensor::fill(const Tensor &from, bool alloc) {
2771 if (alloc && this->empty()) {
2776 if (!from.contiguous || !contiguous) {
2777 /// @todo enable this if needed
2778 throw nntrainer::exception::not_supported(
2779 "[Tensor::fill] non-contiguous tensors are not supported");
2782 if (dim != from.getDim()) {
2783 throw std::invalid_argument("[Tensor::fill] dimension must be the same");
2786 if (strides != from.getStrides()) {
2787 /// @todo length does not represent buffer size, there should be way to
2788 /// get the buffer size
2789 throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
2792 this->copy(from.getData());
2795 void Tensor::save(std::ostream &file) {
2796 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2797 << getName() << " is not contiguous, cannot save.";
2799 std::streamsize sz = static_cast<std::streamsize>(bytes());
2800 NNTR_THROW_IF(sz < 0, std::invalid_argument)
2801 << "save size: " << bytes()
2802 << " is too big. It cannot be represented by std::streamsize";
2804 checkedWrite(file, (char *)getData(), sz, "[Tensor::save] operation failed");
2808 void Tensor::read(std::ifstream &file) {
2809 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2810 << getName() << " is not contiguous, cannot read.";
2812 std::streamsize sz = static_cast<std::streamsize>(bytes());
2814 NNTR_THROW_IF(sz < 0, std::invalid_argument)
2815 << "read size: " << bytes()
2816 << " is too big. It cannot be represented by std::streamsize";
2818 checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
2823 * @brief Calculate average value according to the axis.
2825 Tensor Tensor::average(unsigned int axis) const {
2826 Tensor t("", this->getFormat(), this->getDataType());
2827 return average(axis, t);
2831 * @brief Calculate average value according to the axis.
2833 Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
2834 if (axis >= TensorDim::MAXDIM)
2835 throw std::out_of_range(
2836 "negative axis or axis more then MAXDIM is invalid");
2838 unsigned int axis_size = dim.getDim()[axis];
2842 this->sum(axis, output, 1.0 / ((float)axis_size));
2847 Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
2848 Tensor t("", this->getFormat(), this->getDataType());
2849 return average(axes, t);
2852 Tensor &Tensor::average(const std::vector<unsigned int> &axes,
2853 Tensor &output) const {
2855 return this->average(output);
2857 TensorDim ret_shape(getTensorType());
2859 for (const auto &idx : axes) {
2860 if (idx >= TensorDim::MAXDIM) {
throw std::out_of_range("axis more than MAXDIM is invalid");
2863 ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
2866 return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
2870 * @brief Calculate average value according to the axis.
2872 Tensor Tensor::average() const {
2873 Tensor result = *this;
2874 unsigned int axis = 0;
2875 if (this->getFormat() == Tformat::NHWC) {
2876 result.reshape({1, dim.getDataLen(), 1, 1, this->getTensorType()});
2879 result.reshape({1, 1, 1, dim.getDataLen(), this->getTensorType()});
2882 return result.average(axis);
2886 * @brief Calculate average value according to the axis.
2888 Tensor &Tensor::average(Tensor &output) const {
2889 Tensor result = *this;
2890 result.reshape({1, 1, 1, dim.getDataLen()});
2891 return result.average(3, output);
2894 void Tensor::setValue(float val) {
2895 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2896 << getName() << " is not contiguous, cannot set value.";
2897 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2898 float *data = getData<float>();
2899 std::fill(data, data + size(), val);
2900 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2901 __fp16 *data = getData<__fp16>();
2902 std::fill(data, data + size(), (__fp16)val);
2906 void Tensor::setZero() {
2907 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2909 sscal(size(), 0, getData<float>(), 1);
2911 apply_i([](float val) -> float { return 0; });
2912 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
2914 sscal(size(), 0, getData<__fp16>(), 1);
2916 apply_i([](__fp16 val) -> __fp16 { return 0; });
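/**
 * @brief For every batch slice, returns the flat index (within the feature
 * dimension) of the maximum element.
 */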
2920 std::vector<unsigned int> Tensor::argmax() const {
2921 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2922 << getName() << " is not contiguous, cannot get argmax.";
2923 std::vector<unsigned int> result;
2925 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2926 const float *data = getData();
2927 size_t batch_size = batch();
2928 size_t feature_len = dim.getFeatureLen();
2930 result.resize(batch_size);
2932 for (unsigned int b = 0; b < batch_size; b++) {
2934 std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
2935 result[b] = std::distance(data, max_iter) - (b * feature_len);
2938 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2939 const __fp16 *data = getData<__fp16>();
2940 size_t batch_size = batch();
2941 size_t feature_len = dim.getFeatureLen();
2943 result.resize(batch_size);
2945 for (unsigned int b = 0; b < batch_size; b++) {
2947 std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
2948 result[b] = std::distance(data, max_iter) - (b * feature_len);
2955 float Tensor::l2norm() const {
2956 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2957 << getName() << " is not contiguous, cannot get l2norm.";
2959 unsigned int len = size();
2960 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2961 const float *data = getData<float>();
2962 return snrm2(len, data, 1);
2963 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2964 const __fp16 *data = getData<__fp16>();
2965 return snrm2(len, data, 1);
2969 float Tensor::max_abs() const {
2970 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2971 << getName() << " is not contiguous, cannot get max_abs.";
2973 unsigned int len = size();
2974 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2975 const float *data = getData<float>();
2977 unsigned int idx = isamax(len, data, 1);
2978 return *(data + idx);
2980 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2981 const __fp16 *data = getData<__fp16>();
2983 unsigned int idx = isamax(len, data, 1);
2984 return *(data + idx);
2988 Tensor &Tensor::normalization(Tensor &output) const {
2990 output = Tensor(dim);
2993 output.normalization_i();
2998 void Tensor::normalization_i() {
2999 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3000 << getName() << " is not contiguous, cannot do normalization.";
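// Min-max normalization: the tensor is rescaled in place to the [0, 1] range;
// the degenerate max == min case is handled by a separate branch.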
3002 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3003 const float *data = getData();
3005 auto bounds = std::minmax_element(data, data + size());
3006 const float min = *bounds.first;
3007 const float max = *bounds.second;
3011 this->subtract_i(tmp);
3013 this->subtract_i(min);
3014 this->divide_i(max - min);
3016 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3017 const __fp16 *data = getData<__fp16>();
3019 auto bounds = std::minmax_element(data, data + size());
3020 const __fp16 min = *bounds.first;
3021 const __fp16 max = *bounds.second;
3025 this->subtract_i(tmp);
3027 this->subtract_i(min);
3028 this->divide_i(max - min);
3033 LazyTensor Tensor::chain() const { return LazyTensor(*this); }
3035 Tensor &Tensor::standardization(Tensor &output) const {
3037 output = Tensor(dim);
3040 output.standardization_i();
3045 void Tensor::standardization_i() {
3046 Tensor mean_by_batch = this->sum_by_batch();
3047 mean_by_batch.divide_i(dim.getFeatureLen());
3049 this->subtract_i(mean_by_batch);
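// Each batch slice is centered by its per-batch mean; the divisor computed
// below is the l2 norm of the centered slice divided by the feature length,
// which stands in for the standard deviation.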
3050 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3051 Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3052 std_dev_by_batch.setZero();
3053 float *std_dev = std_dev_by_batch.getData();
3055 for (unsigned int k = 0; k < dim.batch(); ++k) {
3056 Tensor sub_this = this->getBatchSlice(k, 1);
3057 std_dev[k] = sub_this.l2norm();
3060 std_dev_by_batch.divide_i(dim.getFeatureLen());
3061 this->divide_i(std_dev_by_batch);
3062 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3063 Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3064 std_dev_by_batch.setZero();
3065 __fp16 *std_dev = std_dev_by_batch.getData<__fp16>();
3067 for (unsigned int k = 0; k < dim.batch(); ++k) {
3068 Tensor sub_this = this->getBatchSlice(k, 1);
3069 std_dev[k] = sub_this.l2norm();
3072 std_dev_by_batch.divide_i(dim.getFeatureLen());
3073 this->divide_i(std_dev_by_batch);
3077 Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
3078 if (m.size() > this->size())
3079 throw exception::not_supported("broadcasting *this is not supported");
3081 const TensorDim m_dim = m.getDim();
3084 e.tensor_type = getTensorType();
3086 uint continuity[4] = {0, 1, 2, 3};
3087 if (getFormat() == Tformat::NHWC) {
/// checking if the given Tensors can be broadcast
3094 for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) {
3095 if (dim.getTensorDim(continuity[i]) == m_dim.getTensorDim(continuity[i])) {
3096 e.strides[i] = m.strides[i];
/// If the given dimension is 1, it can be reused and the stride remains 0.
/// Note that the case dim[i] == 1 && m_dim[i] == 1 must be checked first;
/// in that case, the strides should not change.
3103 if (m_dim.getTensorDim(continuity[i]) == 1) {
3107 std::stringstream ss;
3108 ss << "[computeBroadcastInfo] broadcasting only allowed for "
3109 "dimension value of 1 \n"
3110 << "this: " << dim << "target: " << m_dim;
3111 throw std::invalid_argument(ss.str().c_str());
3114 /// calculate inner loop size
3117 e.strides[3] = m.strides[3];
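// For illustration (hypothetical shapes): broadcasting m of shape
// {1, 1, 1, W} onto this of shape {N, C, H, W} gives buffer_axis = 2 and
// buffer_size = W in the loop below, so the W-sized chunk of m is reused
// across the outer N * C * H iterations.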
3119 /// initiate buffer info with matching dimension strategy
3120 for (int axis = 3; axis >= 0; --axis) {
3121 if (dim.getTensorDim(continuity[axis]) !=
3122 m_dim.getTensorDim(continuity[axis])) {
3123 e.buffer_axis = axis;
3127 e.buffer_size *= dim.getTensorDim(continuity[axis]);
3130 /// check strategy that uses consecutive ones
3131 if (m_dim.getTensorDim(continuity[3]) == 1) {
3132 unsigned int inner_loop_size = 1;
3134 for (axis = 3; axis >= 0; --axis) {
3135 if (m_dim.getTensorDim(continuity[axis]) != 1) {
3139 inner_loop_size *= dim.getTensorDim(continuity[axis]);
3142 /// if consecutive-one strategy has bigger chunk size, replace the
3144 if (inner_loop_size > e.buffer_size) {
3145 e.buffer_axis = axis;
3146 e.buffer_size = inner_loop_size;
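/**
 * @brief Returns a copy of `in` with every HxW map rotated by 180 degrees,
 * i.e. with both the height and the width axes reversed.
 */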
3154 Tensor Tensor::rotate_180(Tensor in) {
3155 Tensor output(in.getDim());
3156 if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
3158 for (unsigned int i = 0; i < in.batch(); ++i) {
3159 for (unsigned int j = 0; j < in.channel(); ++j) {
3160 for (unsigned int k = 0; k < in.height(); ++k) {
3161 for (unsigned int l = 0; l < in.width(); ++l) {
3162 output.setValue(i, j, k, l,
3163 in.getValue<float>(i, j, (in.height() - k - 1),
3164 (in.width() - l - 1)));
3170 } else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
3172 for (unsigned int i = 0; i < in.batch(); ++i) {
3173 for (unsigned int j = 0; j < in.channel(); ++j) {
3174 for (unsigned int k = 0; k < in.height(); ++k) {
3175 for (unsigned int l = 0; l < in.width(); ++l) {
3176 output.setValue(i, j, k, l,
3177 in.getValue<__fp16>(i, j, (in.height() - k - 1),
3178 (in.width() - l - 1)));
3187 } /* namespace nntrainer */