2 * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * http://www.apache.org/licenses/LICENSE-2.0
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
16 * @date 04 December 2019
17 * @brief This is the Tensor class for calculation
18 * @see https://github.com/nnstreamer/nntrainer
19 * @author Jijoong Moon <jijoong.moon@samsung.com>
20 * @bug No known bugs except for NYI items
38 #include <lazy_tensor.h>
40 #include <util_func.h>
42 #define transposeloop(cl, ci, cj, ck, sl, si, sj, sk) \
44 unsigned int i, j, k, l; \
45 int inidx = 0, outidx = 0; \
46 for (cl = 0; cl < sl; cl++) \
47 for (ci = 0; ci < si; ci++) \
48 for (cj = 0; cj < sj; cj++) \
49 for (ck = 0; ck < sk; ck++) { \
50 outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
51 inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k; \
52 outptr[outidx] = inptr[inidx]; \
56 #define transposeloop_nhwc(cl, ci, cj, ck, sl, si, sj, sk) \
58 unsigned int i, j, k, l; \
59 int inidx = 0, outidx = 0; \
60 for (cl = 0; cl < sl; cl++) \
61 for (ci = 0; ci < si; ci++) \
62 for (cj = 0; cj < sj; cj++) \
63 for (ck = 0; ck < sk; ck++) { \
64 outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
65 inidx = l * SJ * SK * SI + j * SK * SI + k * SI + i; \
66 outptr[outidx] = inptr[inidx]; \
73 * @struct BroadcastInfo
74 * @brief External Loop Info for broadcasted iteration. Please refer to
75 * DISABLED_private_external_loop_n in unittest_nntrainer_tensor.
76 * @note This would be better implemented in an iterator fashion before being used
79 struct Tensor::BroadcastInfo {
82 * @brief Construct a new External Loop Info object
89 tensor_type(nntrainer::TensorDim::TensorType()) {}
91 unsigned int buffer_size; /**< virtual size of the buffer */
92 int buffer_axis; /**< the smallest axis that should be looped.
93 -1 means no loop needed*/
94 std::array<unsigned int, TensorDim::MAXDIM>
95 strides; /**< modified strides for the loop */
96 nntrainer::TensorDim::TensorType tensor_type;
99 Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init,
102 if (d.getDataLen() != 0) {
104 strides = d.computeStrides();
111 Tensor::Tensor(const TensorDim &d, const void *buf) : Tensor(d, true) {
112 if (d.getDataLen() != 0) {
119 * @class SrcSharedTensor
120 * @brief Source of the shared tensor
122 class SrcSharedTensor {
125 * @brief Constructor for the class
127 SrcSharedTensor() : src(nullptr), off(0) {}
129 SrcSharedTensor(const Tensor *tensor, size_t offset) :
134 * @brief Get the allocated src tensor
136 const Tensor *tensor() const {
138 throw std::runtime_error("Accessing empty src tensor");
144 * @brief Get the offset from the source tensor
146 size_t offset() const { return off; }
149 const Tensor *src; /**< Tensor of the source */
150 size_t off; /**< offset from the source data ptr */
153 void Tensor::allocate() {
155 /// already allocated
159 /// allocate data based on the source tensor
160 data = src_tensor->tensor()->data;
161 offset = src_tensor->tensor()->offset + src_tensor->offset();
162 /** as this memory is shared, do NOT initialize */
164 /// allocate new memory for the tensor data
166 MemoryData *mem_data;
168 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
169 mem_data = new MemoryData((void *)(new float[dim.getDataLen()]()));
170 data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
171 delete[](float *) mem_data->getAddr();
175 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
177 mem_data = new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
178 data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
179 delete[](__fp16 *) mem_data->getAddr();
183 throw std::invalid_argument("Error: enable-fp16 is not enabled");
191 bool Tensor::operator==(const Tensor &rhs) const {
192 if (this->dim != rhs.dim)
197 if (len != rhs.size())
200 if (contiguous != rhs.contiguous)
203 if (strides != rhs.strides)
206 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
207 const float *_data = getData<float>();
208 const float *_rdata = rhs.getData<float>();
209 for (size_t i = 0; i < len; ++i) {
210 /** not checking sign change is intentional to avoid float calculation
212 if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
213 (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
214 std::fabs(_data[i] - _rdata[i]) > epsilon)
217 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
219 const __fp16 *_data = getData<__fp16>();
220 const __fp16 *_rdata = rhs.getData<__fp16>();
221 for (size_t i = 0; i < len; ++i) {
222 if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
223 (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
224 std::fabs(_data[i] - _rdata[i]) > epsilon)
228 throw std::invalid_argument("Error: enable-fp16 is not enabled");
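/**
 * Illustrative sketch (not part of the original source): how operator==
 * behaves given the element-wise comparison above. The constructor and
 * setValue signatures used here are assumed from their use elsewhere in
 * this file.
 * @code
 *   Tensor a(1, 1, 2, 2);
 *   Tensor b(1, 1, 2, 2);
 *   a.setValue(1.0f);
 *   b.setValue(1.0f);
 *   bool same = (a == b);          // true: dims, strides and data all match
 *   b.setValue(0, 0, 0, 0, 5.0f);  // perturb a single element
 *   same = (a == b);               // false: |1 - 5| > epsilon
 * @endcode
 */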
235 void Tensor::setRandNormal(float mean, float std) {
236 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
237 setDist<float, std::normal_distribution<float>>(
238 std::normal_distribution<float>(mean, std));
239 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
240 throw std::invalid_argument(
241 "__fp16 is not supported by std::normal_distribution");
245 void Tensor::setRandUniform(float min, float max) {
246 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
247 setDist<float, std::uniform_real_distribution<float>>(
248 std::uniform_real_distribution<float>(min, max));
249 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
250 throw std::invalid_argument(
251 "__fp16 is not supported by std::uniform_real_distribution");
255 void Tensor::setRandBernoulli(float probability) {
256 if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
257 setDist<float, std::bernoulli_distribution>(
258 std::bernoulli_distribution(probability));
259 } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
261 setDist<__fp16, std::bernoulli_distribution>(
262 std::bernoulli_distribution((__fp16)probability));
264 throw std::invalid_argument("Error: enable-fp16 is not enabled");
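/**
 * Illustrative sketch (not part of the original source): filling a tensor
 * with the random setters defined above. FP32 tensors support all three;
 * the FP16 path currently rejects the normal/uniform distributions. Each
 * call overwrites the previous fill.
 * @code
 *   Tensor w(1, 1, 3, 3);           // FP32 by default (assumption)
 *   w.setRandNormal(0.0f, 0.02f);   // values drawn from N(0, 0.02)
 *   w.setRandUniform(-0.1f, 0.1f);  // values drawn from U(-0.1, 0.1)
 *   w.setRandBernoulli(0.5f);       // 0/1 mask with p = 0.5
 * @endcode
 */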
269 void Tensor::initialize() {
270 if (empty() || !isAllocated())
273 unsigned int fan_in, fan_out;
275 /// @fixme: when unit is equal to one, this does not work; we need to rely on the
276 /// effective dimension rather than the actual numbers here. For now, some heuristics
277 /// are added to infer what fan_in/fan_out would be
278 if (dim.batch() * dim.channel() * dim.height() == 1) {
279 fan_out = fan_in = dim.width();
280 } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
281 fan_in = dim.height();
282 fan_out = dim.width();
283 } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
284 auto field_size = dim.height() * dim.width();
286 // this also handles below cases.
287 // 1. fan_in = fan_out = 1 as well.
288 // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
289 fan_in = dim.channel() * field_size;
290 fan_out = dim.batch() * field_size;
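    /*
     * Worked example (illustrative, not part of the original source): for a
     * conv2d filter of dim (batch=64, channel=3, height=3, width=3),
     * field_size = 3 * 3 = 9, so fan_in = 3 * 9 = 27 and fan_out = 64 * 9 = 576.
     * HE_NORMAL below would then draw from N(0, sqrt(2 / 27)).
     */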
293 switch (initializer) {
294 case Tensor::Initializer::ZEROS:
297 case Tensor::Initializer::ONES:
300 case Tensor::Initializer::LECUN_NORMAL:
301 setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
303 case Tensor::Initializer::XAVIER_NORMAL:
304 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
306 case Tensor::Initializer::HE_NORMAL:
307 setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
309 case Tensor::Initializer::LECUN_UNIFORM:
310 setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
312 case Tensor::Initializer::XAVIER_UNIFORM:
313 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
314 sqrtFloat(6.0 / (fan_in + fan_out)));
316 case Tensor::Initializer::HE_UNIFORM:
317 setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
318 sqrtFloat(6.0 / (fan_in)));
327 int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
329 this->multiply_strided(m, *this, beta);
330 } catch (std::exception &err) {
331 ml_loge("%s %s", typeid(err).name(), err.what());
332 return ML_ERROR_INVALID_PARAMETER;
335 return ML_ERROR_NONE;
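/**
 * Illustrative sketch (not part of the original source): element-wise strided
 * multiplication. Both operands must have the same size; broadcasting is
 * rejected by the strided implementation below. Explicit beta arguments are
 * passed to avoid assuming default values.
 * @code
 *   Tensor a(1, 1, 2, 4);
 *   Tensor b(1, 1, 2, 4);
 *   a.setValue(3.0f);
 *   b.setValue(2.0f);
 *   Tensor c = a.multiply_strided(b, 0.0f);      // every element is 3 * 2 = 6
 *   int status = a.multiply_i_strided(b, 0.0f);  // in-place: a *= b, ML_ERROR_NONE
 * @endcode
 */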
338 Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
340 return this->multiply_strided(m, t, beta);
343 Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
344 const float beta) const {
345 /** TODO: throw rather than create new dimensions */
346 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
348 if (size() != m.size() || size() != output.size())
349 throw std::invalid_argument(
350 "Strided multiplication does not support broadcasting");
352 if (getDataType() == Tdatatype::FP32) {
353 NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
354 << getName() << " is not allocated";
355 NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
356 << m.getName() << " is not allocated";
357 NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
358 << output.getName() << " is not allocated";
359 } else if (getDataType() == Tdatatype::FP16) {
361 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
362 << getName() << " is not allocated";
363 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
364 << m.getName() << " is not allocated";
365 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
366 << output.getName() << " is not allocated";
368 throw std::invalid_argument("Error: enable-fp16 is not enabled");
373 if (this->getFormat() == Tformat::NCHW) {
374 if (getDataType() == Tdatatype::FP32) {
375 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
377 for (unsigned int b = 0; b < batch(); ++b) {
378 for (unsigned int c = 0; c < channel(); ++c) {
379 for (unsigned int h = 0; h < height(); ++h) {
380 for (unsigned int w = 0; w < width(); ++w) {
381 output.addValue(b, c, h, w,
382 getValue<float>(b, c, h, w) *
383 m.getValue<float>(b, c, h, w),
390 /** @todo optimize this with combining these loops where stride is 1
392 for (unsigned int b = 0; b < batch(); ++b) {
393 for (unsigned int c = 0; c < channel(); ++c) {
394 for (unsigned int h = 0; h < height(); ++h) {
395 float *out_data = output.getAddress<float>(b, c, h, 0);
396 const float *m_data = m.getAddress<float>(b, c, h, 0);
397 const float *in_data = getAddress<float>(b, c, h, 0);
398 std::transform(in_data, in_data + width(), m_data, out_data,
399 std::multiplies<float>());
404 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
406 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
408 for (unsigned int b = 0; b < batch(); ++b) {
409 for (unsigned int c = 0; c < channel(); ++c) {
410 for (unsigned int h = 0; h < height(); ++h) {
411 for (unsigned int w = 0; w < width(); ++w) {
412 output.addValue(b, c, h, w,
413 getValue<__fp16>(b, c, h, w) *
414 m.getValue<__fp16>(b, c, h, w),
421 for (unsigned int b = 0; b < batch(); ++b) {
422 for (unsigned int c = 0; c < channel(); ++c) {
423 for (unsigned int h = 0; h < height(); ++h) {
424 __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
425 const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
426 const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
427 std::transform(in_data, in_data + width(), m_data, out_data,
428 std::multiplies<__fp16>());
434 throw std::invalid_argument("Error: enable-fp16 is not enabled");
437 } else { // Format NHWC Case
438 if (getDataType() == Tdatatype::FP32) {
439 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
441 for (unsigned int b = 0; b < batch(); ++b) {
442 for (unsigned int h = 0; h < height(); ++h) {
443 for (unsigned int w = 0; w < width(); ++w) {
444 for (unsigned int c = 0; c < channel(); ++c) {
445 output.addValue(b, c, h, w,
446 getValue<float>(b, c, h, w) *
447 m.getValue<float>(b, c, h, w),
454 /** @todo optimize this with combining these loops where
456 for (unsigned int b = 0; b < batch(); ++b) {
457 for (unsigned int h = 0; h < height(); ++h) {
458 for (unsigned int w = 0; w < width(); ++w) {
459 float *out_data = output.getAddress<float>(b, 0, h, w);
460 const float *m_data = m.getAddress<float>(b, 0, h, w);
461 const float *in_data = getAddress<float>(b, 0, h, w);
462 std::transform(in_data, in_data + channel(), m_data, out_data,
463 std::multiplies<float>());
468 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
470 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
472 for (unsigned int b = 0; b < batch(); ++b) {
473 for (unsigned int h = 0; h < height(); ++h) {
474 for (unsigned int w = 0; w < width(); ++w) {
475 for (unsigned int c = 0; c < channel(); ++c) {
476 output.addValue(b, c, h, w,
477 getValue<__fp16>(b, c, h, w) *
478 m.getValue<__fp16>(b, c, h, w),
485 /** @todo optimize this with combining these loops where
487 for (unsigned int b = 0; b < batch(); ++b) {
488 for (unsigned int h = 0; h < height(); ++h) {
489 for (unsigned int w = 0; w < width(); ++w) {
490 __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
491 const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
492 const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
493 std::transform(in_data, in_data + channel(), m_data, out_data,
494 std::multiplies<__fp16>());
500 throw std::invalid_argument("Error: enable-fp16 is not enabled");
508 int Tensor::add_i_strided(Tensor const &m, const float beta) {
510 this->add_strided(m, *this, beta);
511 } catch (std::exception &err) {
512 ml_loge("%s %s", typeid(err).name(), err.what());
513 return ML_ERROR_INVALID_PARAMETER;
516 return ML_ERROR_NONE;
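/**
 * Illustrative sketch (not part of the original source): strided addition
 * with a scaling factor applied to the right-hand operand; with a nonzero
 * beta the element-wise path computes out = this + m * beta.
 * @code
 *   Tensor a(1, 1, 1, 4);
 *   Tensor b(1, 1, 1, 4);
 *   a.setValue(1.0f);
 *   b.setValue(10.0f);
 *   Tensor c = a.add_strided(b, 0.5f);  // every element is 1 + 10 * 0.5 = 6
 * @endcode
 */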
519 Tensor Tensor::add_strided(Tensor const &m, const float beta) const {
521 return this->add_strided(m, t, beta);
524 Tensor &Tensor::add_strided(Tensor const &m, Tensor &output,
525 const float beta) const {
526 /** TODO: throw rather than create new dimensions */
527 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
529 if (size() != m.size() || size() != output.size())
530 throw std::invalid_argument(
531 "Strided addition does not support broadcasting");
533 if (getDataType() == Tdatatype::FP32) {
534 NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
535 << getName() << " is not allocated";
536 NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
537 << m.getName() << " is not allocated";
538 NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
539 << output.getName() << " is not allocated";
540 } else if (getDataType() == Tdatatype::FP16) {
542 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
543 << getName() << " is not allocated";
544 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
545 << m.getName() << " is not allocated";
546 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
547 << output.getName() << " is not allocated";
549 throw std::invalid_argument("Error: enable-fp16 is not enabled");
554 if (this->getFormat() == Tformat::NCHW) {
555 if (getDataType() == Tdatatype::FP32) {
556 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
558 for (unsigned int b = 0; b < batch(); ++b) {
559 for (unsigned int c = 0; c < channel(); ++c) {
560 for (unsigned int h = 0; h < height(); ++h) {
561 for (unsigned int w = 0; w < width(); ++w) {
562 output.setValue(b, c, h, w,
563 getValue<float>(b, c, h, w) +
564 m.getValue<float>(b, c, h, w) * beta);
570 /** @todo optimize this with combining these loops where stride is 1 */
571 for (unsigned int b = 0; b < batch(); ++b) {
572 for (unsigned int c = 0; c < channel(); ++c) {
573 for (unsigned int h = 0; h < height(); ++h) {
574 float *out_data = output.getAddress<float>(b, c, h, 0);
575 const float *m_data = m.getAddress<float>(b, c, h, 0);
576 const float *in_data = getAddress<float>(b, c, h, 0);
577 std::transform(in_data, in_data + width(), m_data, out_data,
583 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
585 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
587 for (unsigned int b = 0; b < batch(); ++b) {
588 for (unsigned int c = 0; c < channel(); ++c) {
589 for (unsigned int h = 0; h < height(); ++h) {
590 for (unsigned int w = 0; w < width(); ++w) {
591 output.setValue(b, c, h, w,
592 getValue<__fp16>(b, c, h, w) +
593 m.getValue<__fp16>(b, c, h, w) * beta);
599 for (unsigned int b = 0; b < batch(); ++b) {
600 for (unsigned int c = 0; c < channel(); ++c) {
601 for (unsigned int h = 0; h < height(); ++h) {
602 __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
603 const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
604 const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
605 std::transform(in_data, in_data + width(), m_data, out_data,
606 std::plus<__fp16>());
612 throw std::invalid_argument("Error: enable-fp16 is not enabled");
615 } else { // Format NHWC Case
616 if (getDataType() == Tdatatype::FP32) {
617 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
619 for (unsigned int b = 0; b < batch(); ++b) {
620 for (unsigned int h = 0; h < height(); ++h) {
621 for (unsigned int w = 0; w < width(); ++w) {
622 for (unsigned int c = 0; c < channel(); ++c) {
623 output.setValue(b, c, h, w,
624 getValue<float>(b, c, h, w) +
625 m.getValue<float>(b, c, h, w) * beta);
631 /** @todo optimize this with combining these loops where
633 for (unsigned int b = 0; b < batch(); ++b) {
634 for (unsigned int h = 0; h < height(); ++h) {
635 for (unsigned int w = 0; w < width(); ++w) {
636 float *out_data = output.getAddress<float>(b, 0, h, w);
637 const float *m_data = m.getAddress<float>(b, 0, h, w);
638 const float *in_data = getAddress<float>(b, 0, h, w);
639 std::transform(in_data, in_data + channel(), m_data, out_data,
645 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
647 if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
649 for (unsigned int b = 0; b < batch(); ++b) {
650 for (unsigned int h = 0; h < height(); ++h) {
651 for (unsigned int w = 0; w < width(); ++w) {
652 for (unsigned int c = 0; c < channel(); ++c) {
653 output.setValue(b, c, h, w,
654 getValue<__fp16>(b, c, h, w) +
655 m.getValue<__fp16>(b, c, h, w) * beta);
661 /** @todo optimize this with combining these loops where
663 for (unsigned int b = 0; b < batch(); ++b) {
664 for (unsigned int h = 0; h < height(); ++h) {
665 for (unsigned int w = 0; w < width(); ++w) {
666 __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
667 const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
668 const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
669 std::transform(in_data, in_data + channel(), m_data, out_data,
670 std::plus<__fp16>());
676 throw std::invalid_argument("Error: enable-fp16 is not enabled");
683 int Tensor::multiply_i(float const &value) {
684 NNTR_THROW_IF(!contiguous, std::invalid_argument)
685 << getName() << " is not contiguous, cannot multiply";
687 /// @note this is not depending on multiply_i as there is an optimized
688 /// version for multiply_i
689 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
690 float *data = getData<float>();
691 unsigned int len = size();
693 sscal(len, value, data, 1);
694 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
696 __fp16 *data = getData<__fp16>();
697 unsigned int len = size();
698 sscal(len, value, data, 1);
700 throw std::invalid_argument("Error: enable-fp16 is not enabled");
703 return ML_ERROR_NONE;
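/**
 * Illustrative sketch (not part of the original source): scaling in place.
 * multiply_i(value) delegates to the BLAS-style scal routine, so every
 * element of a contiguous tensor is multiplied by the scalar.
 * @code
 *   Tensor t(1, 1, 1, 3);
 *   t.setValue(2.0f);
 *   t.multiply_i(4.0f);              // t now holds {8, 8, 8}
 *   Tensor half = t.multiply(0.5f);  // out-of-place result holding {4, 4, 4}
 * @endcode
 */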
706 Tensor Tensor::multiply(float const &value) const {
708 return multiply(value, t);
711 Tensor &Tensor::multiply(float const &value, Tensor &out) const {
712 /// @todo add unittest
713 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
714 auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
715 return apply(f, out);
716 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
718 auto f = std::bind(std::multiplies<__fp16>(), std::placeholders::_1, value);
719 return apply(f, out);
721 throw std::invalid_argument("Error: enable-fp16 is not enabled");
727 int Tensor::multiply_i(Tensor const &m, const float beta) {
729 this->multiply(m, *this, beta);
730 } catch (std::exception &err) {
731 ml_loge("%s %s", typeid(err).name(), err.what());
732 return ML_ERROR_INVALID_PARAMETER;
735 return ML_ERROR_NONE;
738 Tensor Tensor::multiply(Tensor const &m, const float beta) const {
739 Tensor t("", this->getFormat());
740 return this->multiply(m, t, beta);
743 Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
744 const float beta) const {
746 * @note this does not work correctly with differently strided inputs.
747 * Use multiply_strided instead
749 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
750 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
752 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
754 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
755 std::multiplies<float>());
757 for (unsigned int i = 0; i < e.buffer_size; ++i) {
758 *out_buf = *buf * *m_buf + beta * *out_buf;
760 m_buf += e.strides[3];
761 out_buf += output.strides[3];
766 NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
767 << "Tensor Format of " << getName() << ":"
768 << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match ("
769 << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
771 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
772 std::invalid_argument)
773 << getName() << " is not contiguous, cannot multiply";
779 apply_broadcast(m, f, output);
782 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
784 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
786 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
788 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
789 std::multiplies<__fp16>());
791 for (unsigned int i = 0; i < e.buffer_size; ++i) {
792 *out_buf = *buf * *m_buf + beta * *out_buf;
794 m_buf += e.strides[3];
795 out_buf += output.strides[3];
800 NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
801 << "Tensor Format of " << getName() << ":"
802 << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match ("
803 << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
805 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
806 std::invalid_argument)
807 << getName() << " is not contiguous, cannot multiply";
809 apply_broadcast(m, f, output);
812 throw std::invalid_argument("Error: enable-fp16 is not enabled");
818 int Tensor::divide_i(float const &value) {
820 return ML_ERROR_INVALID_PARAMETER;
822 this->divide(value, *this);
823 return ML_ERROR_NONE;
826 Tensor Tensor::divide(float const &value) const {
828 return divide(value, t);
831 Tensor &Tensor::divide(float const &value, Tensor &out) const {
832 auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
833 /// @todo add unittest, __fp16 ZeroDivisionError
835 std::stringstream ss;
836 ss << "[Tensor] divide by value failed, value: " << value;
837 throw std::invalid_argument(ss.str().c_str());
839 return apply(f, out);
842 int Tensor::divide_i(Tensor const &m) {
844 this->divide(m, *this);
845 } catch (std::exception &err) {
846 ml_loge("%s %s", typeid(err).name(), err.what());
847 return ML_ERROR_INVALID_PARAMETER;
850 return ML_ERROR_NONE;
853 Tensor Tensor::divide(Tensor const &m) const {
855 return this->divide(m, t);
858 Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
859 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
860 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
862 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
863 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
864 std::divides<float>());
866 for (unsigned int i = 0; i < e.buffer_size; ++i) {
867 *out_buf = *buf / *m_buf;
869 m_buf += e.strides[3];
870 out_buf += output.strides[3];
875 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
876 std::invalid_argument)
877 << getName() << " is not contiguous, cannot divide";
879 apply_broadcast(m, f, output);
880 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
882 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
884 if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
885 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
886 std::divides<__fp16>());
888 for (unsigned int i = 0; i < e.buffer_size; ++i) {
889 *out_buf = *buf / *m_buf;
891 m_buf += e.strides[3];
892 out_buf += output.strides[3];
897 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
898 std::invalid_argument)
899 << getName() << " is not contiguous, cannot divide";
901 apply_broadcast(m, f, output);
903 throw std::invalid_argument("Error: enable-fp16 is not enabled");
909 int Tensor::add_i(float const &value) {
910 this->add(value, *this);
911 return ML_ERROR_NONE;
914 Tensor Tensor::add(float const &value) const {
916 return add(value, t);
919 Tensor &Tensor::add(float const &value, Tensor &out) const {
920 /// @todo add unittest
921 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
922 auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
923 return apply(f, out);
924 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
926 auto f = std::bind(std::plus<__fp16>(), std::placeholders::_1, value);
927 return apply(f, out);
929 throw std::invalid_argument("Error: enable-fp16 is not enabled");
935 int Tensor::add_i(Tensor const &m, float const alpha) {
936 /// @todo: add axis rather than doing add over the last two dimensions always
937 /// operator i has optimized version
938 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
939 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
941 saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
944 /// @todo: enable this after add_strided supports broadcast
945 // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
946 // << getName() << " is not contiguous, cannot add";
949 apply_broadcast(m, f, *this);
950 } catch (std::exception &err) {
951 ml_loge("%s %s", typeid(err).name(), err.what());
952 return ML_ERROR_INVALID_PARAMETER;
955 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
957 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
959 saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
960 /// @todo: saxpy is not valid for __fp16
963 /// @todo: enable this after add_strided supports broadcast
964 // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
965 // << getName() << " is not contiguous, cannot add";
968 apply_broadcast(m, f, *this);
969 } catch (std::exception &err) {
970 ml_loge("%s %s", typeid(err).name(), err.what());
971 return ML_ERROR_INVALID_PARAMETER;
975 ml_loge("%s", "Error: enable-fp16 is not enabled");
976 return ML_ERROR_INVALID_PARAMETER;
979 return ML_ERROR_NONE;
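/**
 * Illustrative sketch (not part of the original source): add_i with a scale
 * behaves like an axpy update, this += alpha * m, with broadcasting handled
 * by apply_broadcast.
 * @code
 *   Tensor acc(1, 1, 1, 4);
 *   Tensor grad(1, 1, 1, 4);
 *   acc.setValue(1.0f);
 *   grad.setValue(2.0f);
 *   acc.add_i(grad, 0.5f);  // acc becomes 1 + 0.5 * 2 = 2 everywhere
 * @endcode
 */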
982 Tensor Tensor::add(Tensor const &m, float const alpha) const {
984 return this->add(m, t, alpha);
987 Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
988 NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
989 std::invalid_argument)
990 << getName() << " is not contiguous, cannot add";
992 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
993 auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
995 if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
997 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
1000 for (unsigned int i = 0; i < e.buffer_size; ++i) {
1001 *out_buf = *buf + *m_buf * alpha;
1003 m_buf += e.strides[3];
1004 out_buf += strides[3];
1008 apply_broadcast(m, f, output);
1009 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1011 auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
1013 if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
1015 std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
1016 std::plus<__fp16>());
1018 for (unsigned int i = 0; i < e.buffer_size; ++i) {
1019 *out_buf = *buf + *m_buf * alpha;
1021 m_buf += e.strides[3];
1022 out_buf += strides[3];
1026 apply_broadcast(m, f, output);
1028 throw std::invalid_argument("Error: enable-fp16 is not enabled");
1034 int Tensor::subtract_i(float const &value) {
1035 this->subtract(value, *this);
1036 return ML_ERROR_NONE;
1039 Tensor Tensor::subtract(float const &value) const {
1041 return subtract(value, t);
1044 Tensor &Tensor::subtract(float const &value, Tensor &out) const {
1045 /// @todo add unittest
1046 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1047 auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
1048 return apply(f, out);
1049 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1051 auto f = std::bind(std::minus<__fp16>(), std::placeholders::_1, value);
1052 return apply(f, out);
1054 ml_loge("%s", "Error: enable-fp16 is not enabled");
1057 return out; // shouldn't reach
1060 int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
1062 Tensor Tensor::subtract(Tensor const &m) const { return add(m, -1); }
1064 Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const {
1065 return add(m, out, -1);
1068 int Tensor::pow_i(float exponent) {
1069 pow(exponent, *this);
1070 return ML_ERROR_NONE;
1073 Tensor Tensor::pow(float exponent) const {
1075 return pow(exponent, t);
1078 Tensor &Tensor::pow(float exponent, Tensor &out) const {
1079 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1080 auto f = [exponent](float in) { return powf(in, exponent); };
1081 return apply(f, out);
1083 if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1085 auto f = [exponent](__fp16 in) { return powf(in, exponent); };
1086 return apply(f, out);
1088 ml_loge("%s", "Error: enable-fp16 is not enabled");
1094 Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
1095 TensorDim dim_ = dim;
1098 return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen());
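/**
 * Illustrative sketch (not part of the original source): getBatchSlice
 * returns a view that shares memory with the parent tensor, so writes to the
 * slice are visible in the original (assuming the parent is allocated).
 * @code
 *   Tensor x(4, 1, 2, 2);                   // batch of 4
 *   Tensor second = x.getBatchSlice(1, 1);  // batch index 1, one sample
 *   second.setValue(7.0f);                  // also updates that slice of x
 * @endcode
 */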
1101 void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
1104 * - If src already has data allocated, then directly make dest tensor based on
1106 * - If src.data does not exist (meaning the tensor does not have memory allocated),
1107 * and src.src_tensor does not exist (meaning the src tensor does not depend
1108 * on another tensor), then create a SrcSharedTensor around the src.
1109 * - If src.src_tensor exists, then use the src.src_tensor to create the
1110 * required SrcSharedTensor to avoid recursive dependency.
1112 * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
1113 * if the batch size of src is updated and needs reallocation.
1115 dest.data = nullptr;
1117 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1119 } else if (!src.src_tensor)
1120 dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1122 dest.src_tensor = std::make_shared<SrcSharedTensor>(
1123 src.src_tensor->tensor(), offset + src.src_tensor->offset());
1126 Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset,
1128 const std::string &name_) const {
1130 if (dim_.getFormat() != ret.dim.getFormat())
1131 throw std::invalid_argument("Tensor format does not match");
1137 if (dim_.getDataLen() + offset > dim.getDataLen())
1138 throw std::invalid_argument(
1139 "Creating shared tensor of size bigger than tensor memory.");
1142 ret.strides = ret.dim.computeStrides();
1144 TensorDim new_match_dim = dim_;
1145 new_match_dim.batch(dim.batch());
1146 if (new_match_dim != dim && !reset_stride)
1147 ret.contiguous = false;
1150 * In this case, it's the caller's responsibility to ensure that allocate() is
1151 * called for the output tensor before operating on the output tensor.
1153 createSharedDataTensor(*this, ret, offset);
1158 std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
1159 NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1160 << "num size cannot be zero";
1166 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1167 << "cannot split axis of axis: " << axis;
1169 NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument)
1170 << "axis is not divisible by num_size, axis: " << axis
1171 << " num size: " << num_size;
1173 std::vector<size_t> sizes;
1174 sizes.resize(num_size);
1176 unsigned int sz = dim.getTensorDim(axis) / num_size;
1177 std::fill(sizes.begin(), sizes.end(), sz);
1179 return split(sizes, axis);
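/**
 * Illustrative sketch (not part of the original source): splitting a tensor
 * along an axis. Here axis 3 (width) of size 6 is split either into three
 * equal chunks or into explicitly given sizes; each chunk owns a copy of its
 * data.
 * @code
 *   Tensor x(1, 1, 2, 6);
 *   std::vector<Tensor> parts = x.split(3, 3);        // three (1, 1, 2, 2) tensors
 *   std::vector<Tensor> uneven = x.split({2, 4}, 3);  // widths 2 and 4
 * @endcode
 */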
1182 std::vector<Tensor> Tensor::split(std::vector<size_t> sizes, int axis) {
1183 size_t num_size = sizes.size();
1185 NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1186 << "num size cannot be zero";
1192 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1193 << "cannot split axis of axis: " << axis;
1196 std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }),
1197 std::invalid_argument)
1198 << "among given sizes at least one of size is 0";
1200 size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0);
1201 NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
1202 << "given sum of sizes did not match with origin tensor dim, tensor dim: "
1203 << dim.getTensorDim(axis) << " total size: " << total_size;
1205 std::vector<TensorDim> ret_dims;
1206 ret_dims.reserve(num_size);
1207 for (unsigned int i = 0; i < num_size; ++i) {
1209 ret_dims[i].setTensorDim(axis, sizes[i]);
1212 bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
1213 std::vector<Tensor> ret;
1215 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1216 auto iter_value = [this, is_format_nchw](
1217 std::array<size_t, 4> &loc,
1218 const std::array<size_t, 4> &end_loc,
1219 const std::array<size_t, 4> &reset_dim_arr) -> float & {
1220 auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
1221 : getValue(loc[0], loc[3], loc[1], loc[2]);
1222 for (int i = 3; i >= 0; --i) {
1224 if (loc[i] == end_loc[i]) {
1225 loc[i] -= reset_dim_arr[i];
1233 ret.reserve(num_size);
1235 unsigned int accumulated_size = 0;
1236 for (unsigned int i = 0; i < num_size; ++i) {
1237 std::array<size_t, 4> loc = {0, 0, 0, 0};
1239 if (is_format_nchw) {
1240 loc[axis] += accumulated_size;
1243 loc[0] += accumulated_size;
1244 } else if (axis == 1) {
1245 loc[3] += accumulated_size;
1246 } else if (axis == 2 || axis == 3) {
1247 loc[axis - 1] += accumulated_size;
1251 ret.emplace_back(ret_dims[i]);
1252 auto &ret_t = ret.back();
1254 std::array<size_t, 4> end_loc;
1256 if (is_format_nchw) {
1257 end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1258 ret_dims[i].height(), ret_dims[i].width()};
1260 end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1261 ret_dims[i].width(), ret_dims[i].channel()};
1264 accumulated_size += sizes[i];
1266 if (is_format_nchw) {
1267 end_loc[axis] = accumulated_size;
1270 end_loc[0] = accumulated_size;
1271 } else if (axis == 1) {
1272 end_loc[3] = accumulated_size;
1273 } else if (axis == 2 || axis == 3) {
1274 end_loc[axis - 1] = accumulated_size;
1278 std::array<size_t, 4> reset_dim_arr;
1279 if (is_format_nchw) {
1280 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1281 ret_dims[i].height(), ret_dims[i].width()};
1283 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1284 ret_dims[i].width(), ret_dims[i].channel()};
1287 ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1288 return iter_value(loc, end_loc, reset_dim_arr);
1292 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1295 [this, is_format_nchw](
1296 std::array<size_t, 4> &loc, const std::array<size_t, 4> &end_loc,
1297 const std::array<size_t, 4> &reset_dim_arr) -> __fp16 & {
1298 auto &value = (is_format_nchw)
1299 ? getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1300 : getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1301 for (int i = 3; i >= 0; --i) {
1303 if (loc[i] == end_loc[i]) {
1304 loc[i] -= reset_dim_arr[i];
1312 ret.reserve(num_size);
1314 unsigned int accumulated_size = 0;
1315 for (unsigned int i = 0; i < num_size; ++i) {
1316 std::array<size_t, 4> loc = {0, 0, 0, 0};
1318 if (is_format_nchw) {
1319 loc[axis] += accumulated_size;
1322 loc[0] += accumulated_size;
1323 } else if (axis == 1) {
1324 loc[3] += accumulated_size;
1325 } else if (axis == 2 || axis == 3) {
1326 loc[axis - 1] += accumulated_size;
1330 ret.emplace_back(ret_dims[i]);
1331 auto &ret_t = ret.back();
1333 std::array<size_t, 4> end_loc;
1335 if (is_format_nchw) {
1336 end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1337 ret_dims[i].height(), ret_dims[i].width()};
1339 end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1340 ret_dims[i].width(), ret_dims[i].channel()};
1343 accumulated_size += sizes[i];
1345 if (is_format_nchw) {
1346 end_loc[axis] = accumulated_size;
1349 end_loc[0] = accumulated_size;
1350 } else if (axis == 1) {
1351 end_loc[3] = accumulated_size;
1352 } else if (axis == 2 || axis == 3) {
1353 end_loc[axis - 1] = accumulated_size;
1357 std::array<size_t, 4> reset_dim_arr;
1358 if (is_format_nchw) {
1359 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1360 ret_dims[i].height(), ret_dims[i].width()};
1362 reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1363 ret_dims[i].width(), ret_dims[i].channel()};
1366 ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1367 return iter_value(loc, end_loc, reset_dim_arr);
1372 throw std::invalid_argument("Error: enable-fp16 is not enabled");
1379 Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
1385 NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1386 << "cannot split axis of axis: " << axis;
1388 NNTR_THROW_IF(tensors.empty(), std::invalid_argument)
1389 << "given tensor vector is empty";
1392 auto ref_dim = tensors.front().getDim();
1393 bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW);
1394 ref_dim.setTensorDim(axis, 1);
1395 NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
1396 [&ref_dim, axis](const Tensor &t) {
1397 auto cur_dim = t.getDim();
1398 cur_dim.setTensorDim(axis, 1);
1399 return ref_dim == cur_dim;
1401 std::invalid_argument)
1402 << " all tensor must have the same dimension except for the axis, ref_dim: "
1403 << ref_dim << " axis : " << axis;
1405 auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
1406 [axis](unsigned cur, const Tensor &t) {
1407 return cur += t.getDim().getTensorDim(axis);
1409 if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1411 [is_format_nchw](std::array<unsigned, 4> &loc,
1412 const std::array<unsigned, 4> &start_loc, Tensor &t,
1413 const std::array<unsigned, 4> &ref_dim_arr) -> float & {
1414 auto &value = is_format_nchw
1415 ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
1416 : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
1418 for (int i = 3; i >= 0; --i) {
1420 if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1421 loc[i] = start_loc[i];
1429 auto ret_dim = ref_dim;
1430 ret_dim.setTensorDim(axis, axis_dim);
1432 ret = Tensor(ret_dim);
1434 std::array<unsigned, 4> loc = {0, 0, 0, 0};
1435 for (auto &t : tensors) {
1436 std::array<unsigned, 4> start_loc = loc;
1437 std::array<unsigned, 4> tensor_dim_arr;
1438 if (is_format_nchw) {
1439 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1440 tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1441 tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1442 tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1444 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1445 tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1446 tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1447 tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1450 for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1451 iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<float>(i);
1454 if (is_format_nchw) {
1455 loc[axis] += t.getDim().getTensorDim(axis);
1458 loc[0] += t.getDim().getTensorDim(axis);
1459 } else if (axis == 1) {
1460 loc[3] += t.getDim().getTensorDim(axis);
1461 } else if (axis == 2 || axis == 3) {
1462 loc[axis - 1] += t.getDim().getTensorDim(axis);
1468 } else if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1471 [is_format_nchw](std::array<unsigned, 4> &loc,
1472 const std::array<unsigned, 4> &start_loc, Tensor &t,
1473 const std::array<unsigned, 4> &ref_dim_arr) -> __fp16 & {
1474 auto &value = is_format_nchw
1475 ? t.getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1476 : t.getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1478 for (int i = 3; i >= 0; --i) {
1480 if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1481 loc[i] = start_loc[i];
1489 auto ret_dim = ref_dim;
1490 ret_dim.setTensorDim(axis, axis_dim);
1492 ret = Tensor(ret_dim);
1494 std::array<unsigned, 4> loc = {0, 0, 0, 0};
1495 for (auto &t : tensors) {
1496 std::array<unsigned, 4> start_loc = loc;
1497 std::array<unsigned, 4> tensor_dim_arr;
1498 if (is_format_nchw) {
1499 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1500 tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1501 tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1502 tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1504 tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1505 tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1506 tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1507 tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1510 for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1511 iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<__fp16>(i);
1514 if (is_format_nchw) {
1515 loc[axis] += t.getDim().getTensorDim(axis);
1518 loc[0] += t.getDim().getTensorDim(axis);
1519 } else if (axis == 1) {
1520 loc[3] += t.getDim().getTensorDim(axis);
1521 } else if (axis == 2 || axis == 3) {
1522 loc[axis - 1] += t.getDim().getTensorDim(axis);
1528 throw std::invalid_argument("Error: enable-fp16 is not enabled");
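/**
 * Illustrative sketch (not part of the original source): concatenating
 * tensors along an axis. All dimensions other than the concatenation axis
 * must match, as checked above.
 * @code
 *   Tensor a(1, 1, 2, 3);
 *   Tensor b(1, 1, 2, 5);
 *   a.setValue(1.0f);
 *   b.setValue(2.0f);
 *   Tensor c = Tensor::cat({a, b}, 3);  // result dim is (1, 1, 2, 8)
 * @endcode
 */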
1534 void Tensor::makeSharedDataTensor(const Tensor &src, size_t offset) {
1535 if (strides != src.strides)
1536 throw std::invalid_argument(
1537 "Creating shared tensor of different stride than source tensor.");
1539 if (getDim().getDataLen() + offset > src.getDim().getDataLen())
1540 throw std::invalid_argument(
1541 "Creating shared tensor of different size or stride than source tensor.");
1544 * In this case, it's the caller's responsibility to ensure that allocate() is
1545 * called for the output tensor before operating on the output tensor.
1547 createSharedDataTensor(src, *this, offset);
1550 void Tensor::apply_broadcast(
1552 std::function<void(const BroadcastInfo &e, const float *, const float *,
1555 Tensor &output) const {
1556 CREATE_IF_EMPTY_DIMS(output, dim);
1558 NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
1559 << getName() << " is not allocated";
1560 NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument)
1561 << m.getName() << " is not allocated";
1562 NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument)
1563 << output.getName() << " is not allocated";
1565 /// shortcut to cover when dimension matches
1566 /// note that buffer_size and the last stride are only used in v_func, but they
1567 /// might be changed
1570 e.buffer_size = size();
1572 e.tensor_type = getTensorType();
1573 v_func(e, getData(), m.getData(), output.getData());
1577 return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
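/**
 * Illustrative sketch (not part of the original source): the broadcasting
 * performed through apply_broadcast / BroadcastInfo, as seen from the public
 * element-wise operations. An operand with size 1 along an axis is virtually
 * repeated along that axis. Explicit alpha/beta arguments are passed to
 * avoid assuming default values.
 * @code
 *   Tensor x(2, 1, 1, 4);               // dim (2, 1, 1, 4)
 *   Tensor bias(1, 1, 1, 4);            // broadcast over the batch axis
 *   x.setValue(1.0f);
 *   bias.setValue(3.0f);
 *   Tensor y = x.add(bias, 1.0f);       // every element is 1 + 3 = 4
 *   Tensor z = x.multiply(bias, 0.0f);  // beta = 0: plain product, every element 3
 * @endcode
 */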
1581 void Tensor::apply_broadcast(
1583 std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1586 Tensor &output) const {
1587 CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
1589 NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
1590 << getName() << " is not allocated";
1591 NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
1592 << m.getName() << " is not allocated";
1593 NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
1594 << output.getName() << " is not allocated";
1596 /// shortcut to cover when dimension matches
1597 /// note that buffer_size and the last stride are only used in v_func, but they
1598 /// might be changed
1601 e.buffer_size = size();
1603 v_func(e, getData<__fp16>(), m.getData<__fp16>(), output.getData<__fp16>());
1607 return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1610 void Tensor::apply_broadcast_util(
1612 std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1615 Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1616 size_t m_offset) const {
1618 const __fp16 *buf = this->getData<__fp16>();
1619 const __fp16 *m_buf = m.getData<__fp16>();
1620 __fp16 *out_buf = output.getData<__fp16>();
1622 if (e.buffer_axis == cur_axis) {
1623 v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1628 for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) {
1629 size_t next_offset = offset + i * strides[cur_axis];
1630 size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1631 apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1638 void Tensor::apply_broadcast_util(
1640 std::function<void(const BroadcastInfo &e, const float *, const float *,
1643 Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1644 size_t m_offset) const {
1646 const float *buf = this->getData();
1647 const float *m_buf = m.getData();
1648 float *out_buf = output.getData();
1650 if (e.buffer_axis == cur_axis) {
1651 v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1656 uint continuity[4] = {0, 1, 2, 3};
1657 if (getFormat() == Tformat::NHWC) {
1662 for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
1663 size_t next_offset = offset + i * strides[cur_axis];
1664 size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1665 apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1671 * This sums the Tensor data along the batch axis (dim.batch()).
1672 * Therefore the result has dimension (dim.batch(), 1, 1, 1).
1674 Tensor Tensor::sum_by_batch() const {
1675 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1676 << getName() << " is not contiguous, cannot sum";
1678 Tensor ret(dim.batch(), 1, 1, 1, this->getFormat(), getDataType());
1679 size_t feat_len = dim.getFeatureLen();
1680 size_t batch = dim.batch();
1682 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1683 const float *data = getData();
1684 float *rdata = ret.getData();
1686 Tensor ones(1, 1, 1, feat_len, this->getFormat());
1688 sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1689 ones.getData<float>(), 1, 0.0, rdata, 1);
1690 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1692 const __fp16 *data = getData<__fp16>();
1693 __fp16 *rdata = ret.getData<__fp16>();
1695 Tensor ones(1, 1, 1, feat_len, this->getTensorType());
1696 ones.setValue((__fp16)1.0);
1697 sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1698 ones.getData<__fp16>(), 1, 0.0, rdata, 1);
1700 throw std::invalid_argument("Error: enable-fp16 is not enabled");
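/**
 * Illustrative note (not part of the original source): sum_by_batch reduces
 * each sample to a single scalar by multiplying the (batch x feat_len) data
 * matrix with a ones vector via sgemv, i.e. ret[b] = sum_k data[b][k].
 * @code
 *   Tensor x(2, 1, 1, 3);
 *   x.setValue(2.0f);
 *   Tensor s = x.sum_by_batch();  // dim (2, 1, 1, 1), each value 2 * 3 = 6
 * @endcode
 */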
1708 * @brief Calculate sum according to the axis.
1710 Tensor Tensor::sum(unsigned int axis, float alpha) const {
1711 Tensor ret("", this->getFormat(), this->getDataType());
1712 return sum(axis, ret, alpha, 0);
1715 Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
1718 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1719 const float *data = getData<float>();
1721 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1722 << getName() << " is not contiguous, cannot sum";
1725 throw std::out_of_range("Error: axis is invalid");
1727 if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1728 CREATE_IF_EMPTY_DIMS(ret, dim);
1729 ret.copy(this->getData());
1735 CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1736 this->getTensorType());
1737 size_t feat_len = dim.getFeatureLen();
1738 size_t batch = dim.batch();
1739 Tensor ones(1, 1, 1, batch, this->getFormat());
1740 ones.setValue(alpha);
1741 sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1742 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1745 CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1746 if (this->getFormat() == Tformat::NHWC) {
1747 unsigned int m = ret.dim.getDataLen();
1748 unsigned int n = dim[1];
1749 Tensor ones(1, 1, 1, n, this->getTensorType());
1750 ones.setValue(alpha);
1751 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1752 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1754 unsigned int feat_len = dim[2] * dim[3];
1755 unsigned int t_axis = dim[1];
1756 Tensor ones(1, 1, 1, t_axis, getTensorType());
1757 ones.setValue(alpha);
1758 float *rdata = ret.getData<float>();
1759 for (unsigned int k = 0; k < dim[0]; ++k) {
1760 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1761 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1762 1, beta, &rdata[k * feat_len], 1);
1767 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1769 if (this->getFormat() == Tformat::NHWC) {
1770 unsigned int feat_len = dim[1] * dim[3];
1771 unsigned int t_axis = dim[2];
1772 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1773 ones.setValue(alpha);
1774 float *rdata = ret.getData<float>();
1775 for (unsigned int k = 0; k < dim[0]; ++k) {
1776 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1777 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1778 1, beta, &rdata[k * feat_len], 1);
1781 unsigned int t_3 = dim[3];
1782 unsigned int t_axis = dim[2];
1783 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1784 ones.setValue(alpha);
1785 float *rdata = ret.getData<float>();
1786 for (unsigned int k = 0; k < dim[0]; ++k) {
1787 for (unsigned int c = 0; c < dim[1]; ++c) {
1788 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1789 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1790 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1791 ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1797 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1,
1798 this->getTensorType());
1799 if (this->getFormat() == Tformat::NHWC) {
1800 unsigned int t_3 = dim[1];
1801 unsigned int t_axis = dim[3];
1802 Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1803 ones.setValue(alpha);
1804 float *rdata = ret.getData<float>();
1805 for (unsigned int k = 0; k < dim[0]; ++k) {
1806 for (unsigned int c = 0; c < dim[2]; ++c) {
1807 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1808 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1809 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1810 ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1814 unsigned int m = ret.dim.getDataLen();
1815 unsigned int n = dim[3];
1816 Tensor ones(1, 1, 1, n);
1817 ones.setValue(alpha);
1818 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1819 ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1823 throw std::out_of_range("Error: Dimension cannot exceed 3");
1825 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1827 const __fp16 *data = getData<__fp16>();
1829 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1830 << getName() << " is not contiguous, cannot sum";
1833 throw std::out_of_range("Error: axis is invalid");
1835 if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1836 CREATE_IF_EMPTY_DIMS(ret, dim);
1837 ret.copy(this->getData<__fp16>());
1843 CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1844 this->getTensorType());
1845 size_t feat_len = dim.getFeatureLen();
1846 size_t batch = dim.batch();
1847 Tensor ones(1, 1, 1, batch, this->getTensorType());
1848 ones.setValue(alpha);
1849 sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1850 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1853 CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1854 if (this->getFormat() == Tformat::NHWC) {
1855 unsigned int m = ret.dim.getDataLen();
1856 unsigned int n = dim[1];
1857 Tensor ones(1, 1, 1, n, this->getTensorType());
1858 ones.setValue(alpha);
1859 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1860 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1862 unsigned int feat_len = dim[2] * dim[3];
1863 unsigned int t_axis = dim[1];
1864 Tensor ones(1, 1, 1, t_axis, getTensorType());
1865 ones.setValue(alpha);
1866 __fp16 *rdata = ret.getData<__fp16>();
1867 for (unsigned int k = 0; k < dim[0]; ++k) {
1868 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1869 &data[k * dim.getFeatureLen()], feat_len,
1870 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1875 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1877 if (this->getFormat() == Tformat::NHWC) {
1878 unsigned int feat_len = dim[1] * dim[3];
1879 unsigned int t_axis = dim[2];
1880 Tensor ones(1, 1, 1, t_axis, getTensorType());
1881 ones.setValue(alpha);
1882 __fp16 *rdata = ret.getData<__fp16>();
1883 for (unsigned int k = 0; k < dim[0]; ++k) {
1884 sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1885 &data[k * dim.getFeatureLen()], feat_len,
1886 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1889 unsigned int t_3 = dim[3];
1890 unsigned int t_axis = dim[2];
1891 Tensor ones(1, 1, 1, t_axis, getTensorType());
1892 ones.setValue(alpha);
1893 __fp16 *rdata = ret.getData<__fp16>();
1894 for (unsigned int k = 0; k < dim[0]; ++k) {
1895 for (unsigned int c = 0; c < dim[1]; ++c) {
1896 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1897 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1898 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1899 ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1905 CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, getTensorType());
1906 if (this->getFormat() == Tformat::NHWC) {
1907 unsigned int t_3 = dim[1];
1908 unsigned int t_axis = dim[3];
1909 Tensor ones(1, 1, 1, t_axis, getTensorType());
1910 ones.setValue(alpha);
1911 __fp16 *rdata = ret.getData<__fp16>();
1912 for (unsigned int k = 0; k < dim[0]; ++k) {
1913 for (unsigned int c = 0; c < dim[2]; ++c) {
1914 unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1915 unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1916 sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1917 ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1921 unsigned int m = ret.dim.getDataLen();
1922 unsigned int n = dim[3];
1923 Tensor ones(1, 1, 1, n, getTensorType());
1924 ones.setValue(alpha);
1925 sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1926 ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1930 throw std::out_of_range("Error: Dimension cannot exceed 3");
1933 throw std::invalid_argument("Error: enable-fp16 is not enabled");
1939 Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
1940 Tensor ret("", this->getFormat());
1941 return sum(axes, ret, alpha);
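/**
 * Illustrative sketch (not part of the original source): reducing along one
 * or several axes. sum(axis) keeps the reduced axis with size 1; the
 * multi-axis overload clubs contiguous axes together before reducing.
 * Explicit alpha arguments are passed to avoid assuming default values.
 * @code
 *   Tensor x(2, 3, 4, 5);
 *   Tensor s0 = x.sum(0, 1.0f);        // dim (1, 3, 4, 5)
 *   Tensor s23 = x.sum({2, 3}, 1.0f);  // dim (2, 3, 1, 1)
 * @endcode
 */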
1944 void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
1945 std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1946 NNTR_THROW_IF(!contiguous, std::invalid_argument)
1947 << getName() << " is not contiguous, cannot merge axis";
1949 if (axis2 != axis1 + 1)
1950 if (!checkContinuous(axis1, axis2))
1951 throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
1953 dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2));
1954 dim.setTensorDim(axis1, 1);
1957 Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
1958 float alpha) const {
1960 throw std::invalid_argument("empty axes given");
1962 if (axes.size() == 1) {
1963 this->sum(axes[0], output, alpha);
1965 /** club axes together */
1966 Tensor new_reshaped = *this;
1967 std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1968 std::vector<unsigned int> new_axes = {axes[0]};
1970 for (unsigned int i = 1; i < axes.size(); ++i) {
1971 if (checkContinuous(axes[i - 1], axes[i])) {
1972 new_reshaped.mergeAxis(axes[i - 1], axes[i]);
1973 new_axes.back() = axes[i];
1975 new_axes.push_back(axes[i]);
1979 Tensor ret = new_reshaped.sum(new_axes[0]);
1980 for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
1981       ret = ret.sum(new_axes[i]);
1982 ret.sum(new_axes.back(), output, alpha);
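// Illustrative example (shapes assumed): summing a [2, 3, 4, 5] NCHW tensor
// over axes {1, 2} first merges the adjacent axes so the reduction effectively
// runs over a single axis of size 12, producing an output of dim [2, 1, 1, 5]
// scaled by alpha.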
1988 Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
1989 bool trans_m, float beta) const {
1990 if (!result.isAllocated())
1991 throw std::invalid_argument(
1992 "Output tensor must be preallocated for dotBatched operation");
1993 for (unsigned int b = 0; b < batch(); b++) {
1994     /** @todo try using transpose to speed up the operation */
1995 const Tensor this_b = this->getBatchSlice(b, 1);
1996 Tensor m_b = m.getBatchSlice(b, 1);
1997 Tensor result_b = result.getBatchSlice(b, 1);
1999 this_b.dot(m_b, result_b, trans, trans_m, beta);
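// Usage sketch (illustrative only; constructor forms and dims are assumed):
// both operands and the preallocated result share the batch size, e.g.
//   Tensor a(4, 1, 3, 5), b(4, 1, 5, 2), out(4, 1, 3, 2);
//   a.dotBatched(b, out);  // four independent (3x5) x (5x2) products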
2005 Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const {
2006 Tensor output("", this->getFormat(), this->getDataType());
2007 dot(m, output, trans, trans_m);
2012  * @brief compute the derivative wrt. this and store it in the current tensor
2013  * @todo will have to see if beta affects this computation
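 * For the default case (trans == false, trans_m == false), output = this . m,
 * so by the chain rule d(this) = output_deriv . m^T; this corresponds to the
 * output_deriv.dot(m, *this, false, true, beta) call below (a note derived
 * from the code, not additional behavior).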
2015 Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
2016 bool trans, bool trans_m, float beta) {
2017 bool deriv_trans_m = true;
2018 bool deriv_trans = false;
2019 /** @todo handle all cases of trans and trans_m */
2020 if (!trans && trans_m) {
2021 deriv_trans_m = false;
2024 return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
2028 * @brief compute the derivative wrt m in the m tensor
2029 * @note The caller tensor must be the same tensor as the one which called the
2032 Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
2033 bool trans, bool trans_m, float beta) const {
2034 bool deriv_trans_m = false;
2035 bool deriv_trans = true;
2036 /** @todo handle all cases of trans and trans_m */
2038 if (!trans && trans_m) {
2039 output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
2042 return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
2046 Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
2047 Tensor const &output_deriv, bool trans,
2048 bool trans_m, float beta) {
2049 bool deriv_trans_m = true;
2050 bool deriv_trans = false;
2051 /** @todo handle all cases of trans and trans_m */
2052 if (!trans && trans_m) {
2053 deriv_trans_m = false;
2056 return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
2059 Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
2060 Tensor const &output_deriv, bool trans,
2061 bool trans_m, float beta) const {
2062 bool deriv_trans_m = false;
2063 bool deriv_trans = true;
2064 /** @todo handle all cases of trans and trans_m */
2066 if (!trans && trans_m) {
2067 output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
2070 return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
2075  * @note: This dot product flattens the first 3 axes for the purpose of
2076  * computation, so the operands behave as 2-D matrices during the
2077  * operation. The original dimensions are restored in the returned tensor
2078  * when trans is false.
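 *
 * Illustrative example (shapes assumed, NCHW, no transposes): a tensor of
 * dim [2, 3, 4, 5] dotted with one of dim [1, 1, 5, 7] behaves as a
 * (2*3*4 x 5) by (5 x 7) GEMM and produces a result of dim [2, 3, 4, 7].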
2080 Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m,
2082 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2083 << getName() << " is not contiguous. Cannot dot product.";
2085   // Commented out with the intention to support the calculation wrt. the batch
2086   // and height directions. It assumes this->dim is [ BxCxH, W ] and m.dim is
2087   // [ BxCxH, W ] as well. if (m.dim.rank() > 2) {
2088 // throw exception::not_supported("Error: support only for rank of dot "
2092   // Commented out with the intention to support the calculation wrt. the batch
2093   // and height directions of this tensor. It is OK as long as m is 2D.
2095 if (trans && dim.rank() > 2) {
2096 ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
2098 unsigned int dim1, dim2, mdim1, mdim2;
2099 if (getFormat() == Tformat::NHWC) {
2100 dim1 = batch() * height() * width();
2102 mdim1 = m.batch() * m.height() * m.width();
2103 mdim2 = m.channel();
2105 dim1 = batch() * channel() * height();
2107 mdim1 = m.batch() * m.channel() * m.height();
2111 unsigned int M, N, K, lda, ldb, ldc;
2113 if (!trans && !trans_m) {
2115 throw std::runtime_error(
2116 "Error: incompatible dimensions for dot product");
2117 K = mdim1; /** == dim2 */
2120 if (getFormat() == Tformat::NHWC) {
2121 CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2122 getTensorType()); // NHWC Result Tensor
2124 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2128       // We do not zero the result here for performance reasons.
2129       // However, the result is then left uninitialized and may contain
2130       // garbage such as NaN. When the value is used as in C = alpha*A*B +
2131       // beta*C, we must verify that the garbage data in C has no effect.
2133 } else if (!trans && trans_m) {
2135 throw std::runtime_error(
2136 "Error: incompatible dimensions for dot product");
2137 K = mdim2; /** == dim2 */
2140 if (getFormat() == Tformat::NHWC) {
2141 CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2144 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2146 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2148 CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2151 } else if (trans && !trans_m) {
2153 throw std::runtime_error(
2154 "Error: incompatible dimensions for dot product");
2155 K = mdim1; /** == dim1 */
2158 if (getFormat() == Tformat::NHWC) {
2159 CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2161 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2165 throw std::runtime_error(
2166 "Error: incompatible dimensions for dot product");
2167 K = mdim2; /** == dim1 */
2170 if (getFormat() == Tformat::NHWC) {
2171 CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2173 CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2178 ldc = (getFormat() == Tformat::NHWC) ? result.channel() : result.width();
2180 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2181 const float *data = getData();
2182 const float *mdata = m.getData();
2183 float *rdata = result.getData();
2184 const float alpha = 1.0f;
2185 enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2186 enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2188 /// shortcut handling in case of vector
2189 /// for vector, (1 * K) == (K * 1) in current memory layout...
2190     /// and please note that N, K, M are fixed placeholders after considering
2192 /// For example, there is no case like (1 * K) X (1 * K) while
2193 /// (1 * K) X (1 * M) can be a case
2194 /// case1: (1 * K) X (K * 1)
2195 if (M == 1 && N == 1) {
2196 *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2198 /// case2: (M * K) X (K * 1)
2200 sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2203 /// case3: (1 * K) X (K * N) = 1 * N = R
2204 /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2205 /// Effectively a translation of sgemv
2207 transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2208 sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2211 /// case others: use gemm
2213 sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2214 ldb, beta, rdata, ldc);
2216 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2218 const __fp16 *data = getData<__fp16>();
2219 const __fp16 *mdata = m.getData<__fp16>();
2220 __fp16 *rdata = result.getData<__fp16>();
2221 const float alpha = 1.0f;
2222 enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2223 enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2225 /// shortcut handling in case of vector
2226 /// for vector, (1 * K) == (K * 1) in current memory layout...
2227     /// and please note that N, K, M are fixed placeholders after considering
2229 /// For example, there is no case like (1 * K) X (1 * K) while
2230 /// (1 * K) X (1 * M) can be a case
2231 /// case1: (1 * K) X (K * 1)
2232 if (M == 1 && N == 1) {
2233 *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2235 /// case2: (M * K) X (K * 1)
2237 sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2240 /// case3: (1 * K) X (K * N) = 1 * N = R
2241 /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2242 /// Effectively a translation of sgemv
2244 transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2245 sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2248 /// case others: use sgemm
2250 sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2251 ldb, beta, rdata, ldc);
2254 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2261 Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
2262 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2263 << getName() << " is not contiguous. Cannot transpose.";
2265 if (out.getData() == getData()) {
2266 Tensor tmp = clone();
2267 return tmp.transpose(direction, out);
2270 unsigned int SL, SI, SJ, SK;
2272 out.reshape(dim.transpose(direction));
2274 int indexI = direction[0] - '0';
2275 int indexJ = direction[2] - '0';
2277 SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
2279 bool is_format_nchw = (getFormat() == Tformat::NCHW);
2281 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2282 const float *inptr = getData();
2283 float *outptr = out.getData();
2287 if (is_format_nchw) {
2288 transposeloop(l, i, j, k, SL, SI, SJ, SK);
2290 transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2293 if (is_format_nchw) {
2294 transposeloop(l, i, k, j, SL, SI, SK, SJ);
2296 transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2302 if (is_format_nchw) {
2303 transposeloop(l, j, i, k, SL, SJ, SI, SK);
2305 transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2308 if (is_format_nchw) {
2309 transposeloop(l, j, k, i, SL, SJ, SK, SI);
2311 transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2317 if (is_format_nchw) {
2318 transposeloop(l, k, i, j, SL, SK, SI, SJ);
2320 transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2323 if (is_format_nchw) {
2324 transposeloop(l, k, j, i, SL, SK, SJ, SI);
2326 transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2331 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2333 const __fp16 *inptr = getData<__fp16>();
2334 __fp16 *outptr = out.getData<__fp16>();
2338 if (is_format_nchw) {
2339 transposeloop(l, i, j, k, SL, SI, SJ, SK);
2341 transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2344 if (is_format_nchw) {
2345 transposeloop(l, i, k, j, SL, SI, SK, SJ);
2347 transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2353 if (is_format_nchw) {
2354 transposeloop(l, j, i, k, SL, SJ, SI, SK);
2356 transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2359 if (is_format_nchw) {
2360 transposeloop(l, j, k, i, SL, SJ, SK, SI);
2362 transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2368 if (is_format_nchw) {
2369 transposeloop(l, k, i, j, SL, SK, SI, SJ);
2371 transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2374 if (is_format_nchw) {
2375 transposeloop(l, k, j, i, SL, SK, SJ, SI);
2377 transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2383 throw std::invalid_argument("Error: enable-fp16 is not enabled");
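// Illustrative example (shapes assumed): for an NCHW tensor of dim
// [1, 2, 3, 4], transpose("1:0:2") swaps the channel and height axes and
// yields dim [1, 3, 2, 4], while transpose("0:2:1") swaps height and width
// and yields dim [1, 2, 4, 3].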
2390 Tensor Tensor::transpose(const std::string &direction) const {
2392 transpose(direction, result);
2396 Tensor Tensor::dropout_mask(float dropout) const {
2398 result.dropout_mask(dropout);
2402 void Tensor::dropout_mask(float dropout) {
2403 setRandUniform(0.0, 1.0);
2404 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2405 float scale = 1.0 / (1 - dropout);
2406 float *data_ = getData();
2407 for (unsigned int i = 0; i < size(); ++i) {
2408 if (data_[i] >= dropout)
2413 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2415 __fp16 scale = 1.0 / (1 - dropout);
2416 __fp16 *data_ = getData<__fp16>();
2417 for (unsigned int i = 0; i < size(); ++i) {
2418 if (data_[i] >= dropout)
2424 throw std::invalid_argument("Error: enable-fp16 is not enabled");
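// Note on the mask above: entries drawn below the dropout probability are
// presumably zeroed while the surviving entries are set to scale = 1 / (1 -
// dropout), the usual inverted-dropout convention that keeps the expected
// activation unchanged.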
2429 void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
2430 float fill_mask_val = 0.0;
2431 float en_mask_val = 1.0 - fill_mask_val;
2434 fill_mask_val = 1.0;
2435 en_mask_val = 1.0 - fill_mask_val;
2438 setValue(fill_mask_val);
2439 if (mask_len.batch() != batch())
2440 throw std::invalid_argument("Number of filter masks mismatched");
2441 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2442 for (unsigned int b = 0; b < batch(); b++) {
2443 float *addr = getAddress(b, 0, 0, 0);
2444 const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2445 std::fill(addr, addr + (*mask_len_val), en_mask_val);
2447 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2449 for (unsigned int b = 0; b < batch(); b++) {
2450 __fp16 *addr = getAddress<__fp16>(b, 0, 0, 0);
2451 const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2452 std::fill(addr, addr + (*mask_len_val), (__fp16)en_mask_val);
2455 throw std::invalid_argument("Error: enable-fp16 is not enabled");
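// Illustrative example (shapes and values assumed): with width 5, reverse ==
// false and mask_len holding {3, 1}, the two batch rows become [1 1 1 0 0] and
// [1 0 0 0 0]; with reverse == true the fill and enable values are swapped.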
2460 Tensor Tensor::zoneout_mask(float zoneout) {
2461 Tensor ret(getDim());
2462 zoneout_mask(ret, zoneout);
2466 void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
2467 if (dim != opposite.dim) {
2468 throw std::invalid_argument(
2469 "[Tensor::zoneout_mask] opposite dimension does not match");
2472 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2473 opposite.setRandBernoulli(zoneout);
2475 float *data = getData();
2476 float *opposite_data = opposite.getData();
2478 for (unsigned int i = 0; i < size(); ++i) {
2479 if (opposite_data[i] > epsilon) {
2485 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2487 __fp16 zoneout_fp16 = (__fp16)zoneout;
2488 opposite.setRandBernoulli(zoneout_fp16);
2490 __fp16 *data = getData<__fp16>();
2491 __fp16 *opposite_data = opposite.getData<__fp16>();
2493 for (unsigned int i = 0; i < size(); ++i) {
2494 if (opposite_data[i] > epsilon) {
2495 data[i] = (__fp16)0.0;
2497 data[i] = (__fp16)1.0;
2501 throw std::invalid_argument("Error: enable-fp16 is not enabled");
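// Note derived from the code above: `opposite` is filled with Bernoulli
// samples of probability `zoneout`, and this tensor becomes its complement,
// 0 where the opposite mask fired and 1 elsewhere, so the two masks
// effectively sum to 1 element-wise.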
2506 // int Tensor::apply_i(std::function<float(float)> f) {
2507 // Tensor result = *this;
2508 // apply(f, result);
2510 // return ML_ERROR_NONE;
2513 // Tensor Tensor::apply(std::function<float(float)> f) const {
2515 // return apply(f, result);
2518 // Tensor &Tensor::apply(std::function<float(float)> f, Tensor &output) const {
2519 // CREATE_IF_EMPTY_DIMS(output, dim);
2521 // if (dim != output.dim) {
2522 // /// @todo add unittest
2523 // throw std::invalid_argument(
2524 // "[Tensor::apply] output dimension does not match");
2527 // if (contiguous && output.contiguous) {
2528 // const float *data = getData();
2529 // float *rdata = output.getData();
2530 // std::transform(data, data + size(), rdata, f);
2531 // } else if (strides[3] == 1 && output.strides[3] == 1) {
2532 // /** @todo optimize this with combining these loops where stride is 1 */
2533 // for (unsigned int b = 0; b < batch(); ++b) {
2534 // for (unsigned int c = 0; c < channel(); ++c) {
2535 // for (unsigned int h = 0; h < height(); ++h) {
2536 // float *out_data = output.getAddress(b, c, h, 0);
2537 // const float *in_data = getAddress(b, c, h, 0);
2538 // std::transform(in_data, in_data + width(), out_data, f);
2543 // for (unsigned int b = 0; b < batch(); ++b) {
2544 // for (unsigned int c = 0; c < channel(); ++c) {
2545 // for (unsigned int h = 0; h < height(); ++h) {
2546 // for (unsigned int w = 0; w < width(); ++w) {
2547 // output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
2557 Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
2559 Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
2560 Tensor &output) const {
2561 return f(*this, output);
2564 void Tensor::print(std::ostream &out) const {
2565 printInstance(out, this);
2566 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2567 const float *data = getData<float>();
2568 unsigned int len = size();
2569 out << "data addr: " << data << '\n';
2573 out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2574 << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2575 << ']' << std::endl;
2579 std::ios init(NULL);
2581 if (getFormat() == Tformat::NCHW) {
2582 for (unsigned int k = 0; k < batch(); k++) {
2583 for (unsigned int l = 0; l < channel(); l++) {
2584 for (unsigned int i = 0; i < height(); i++) {
2585 for (unsigned int j = 0; j < width(); j++) {
2586 out << std::setw(10) << std::setprecision(10)
2587 << this->getValue<float>(k, l, i, j) << " ";
2593 out << "-------" << std::endl;
2596 for (unsigned int k = 0; k < batch(); k++) {
2597 for (unsigned int i = 0; i < height(); i++) {
2598 for (unsigned int j = 0; j < width(); j++) {
2599 for (unsigned int l = 0; l < channel(); l++) {
2600 out << std::setw(10) << std::setprecision(10)
2601 << this->getValue<float>(k, l, i, j) << " ";
2607 out << "-------" << std::endl;
2611 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2613 const __fp16 *data = getData<__fp16>();
2614 unsigned int len = size();
2615 out << "data addr: " << data << '\n';
2619 out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2620 << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2621 << ']' << std::endl;
2625 std::ios init(NULL);
2627 if (getFormat() == Tformat::NCHW) {
2628 for (unsigned int k = 0; k < batch(); k++) {
2629 for (unsigned int l = 0; l < channel(); l++) {
2630 for (unsigned int i = 0; i < height(); i++) {
2631 for (unsigned int j = 0; j < width(); j++) {
2632 out << std::setw(10) << std::setprecision(10)
2633 << this->getValue<__fp16>(k, l, i, j) << " ";
2639 out << "-------" << std::endl;
2642 for (unsigned int k = 0; k < batch(); k++) {
2643 for (unsigned int i = 0; i < height(); i++) {
2644 for (unsigned int j = 0; j < width(); j++) {
2645 for (unsigned int l = 0; l < channel(); l++) {
2646 out << std::setw(10) << std::setprecision(10)
2647 << this->getValue<__fp16>(k, l, i, j) << " ";
2653 out << "-------" << std::endl;
2658 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2663 void Tensor::print_(std::ostream &out, uint opt) const {
2664 printInstance(out, this);
2665 const float *data = getData();
2667 unsigned int len = size();
2669 std::ios init(NULL);
2672 if (getFormat() == Tformat::NCHW) {
2674 for (unsigned int k = 0; k < batch(); k++) {
2676 for (unsigned int i = 0; i < channel(); i++) {
2678 for (unsigned int j = 0; j < height(); j++) {
2680 for (unsigned int l = 0; l < width(); l++) {
2681 if (l < channel() - 1)
2682 out << std::setw(10) << std::setprecision(10)
2683 << this->getValue<float>(k, l, i, j) << ", ";
2685 out << std::setw(10) << std::setprecision(10)
2686 << this->getValue<float>(k, l, i, j);
2688 if (j < height() - 1)
2694 if (i < channel() - 1)
2700 if (k < batch() - 1)
2709 for (unsigned int k = 0; k < batch(); k++) {
2711 for (unsigned int i = 0; i < height(); i++) {
2713 for (unsigned int j = 0; j < width(); j++) {
2715 for (unsigned int l = 0; l < channel(); l++) {
2716 if (l < channel() - 1)
2717 out << std::setw(10) << std::setprecision(10)
2718 << this->getValue<float>(k, l, i, j) << ", ";
2720 out << std::setw(10) << std::setprecision(10)
2721 << this->getValue<float>(k, l, i, j);
2723 if (j < width() - 1)
2729 if (i < height() - 1)
2735 if (k < batch() - 1)
2744 for (uint i = 0; i < len; ++i) {
2745 out << getData<float>()[i] << ", ";
2750 std::ostream &operator<<(std::ostream &out, Tensor const &m) {
2755 void Tensor::copy(const void *buf) {
2756 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2757     << getName() << " is not contiguous, cannot copy.";
2759 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2761 if (buf == getData<__fp16>()) {
2765 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2767 } else if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2768 if (buf == getData()) {
2772 // std::string type_ =
2773 // (getDataType() == ml::train::TensorDim::DataType::FP16) ? "FP16" : "NO";
2774 // std::cout << type_ << std::endl;
2776 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2778 scopy(size(), (__fp16 *)buf, 1, getData<__fp16>(), 1);
2780 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2782 } else if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2783 scopy(size(), (float *)buf, 1, getData<float>(), 1);
2787 void Tensor::copy_with_stride(const Tensor &from) {
2789 if (dim == from.getDim()) {
2790 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2791 for (unsigned int b = 0; b < batch(); ++b) {
2792 for (unsigned int c = 0; c < channel(); ++c) {
2793 for (unsigned int h = 0; h < height(); ++h) {
2794 for (unsigned int w = 0; w < width(); ++w) {
2795 setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
2800 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
2802 for (unsigned int b = 0; b < batch(); ++b) {
2803 for (unsigned int c = 0; c < channel(); ++c) {
2804 for (unsigned int h = 0; h < height(); ++h) {
2805 for (unsigned int w = 0; w < width(); ++w) {
2806 setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2812 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2816 Tensor t = Tensor(from.getDim(), true);
2817 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2818 for (unsigned int b = 0; b < t.batch(); ++b) {
2819 for (unsigned int c = 0; c < t.channel(); ++c) {
2820 for (unsigned int h = 0; h < t.height(); ++h) {
2821 for (unsigned int w = 0; w < t.width(); ++w) {
2822 t.setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
2827 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
2829 for (unsigned int b = 0; b < batch(); ++b) {
2830 for (unsigned int c = 0; c < channel(); ++c) {
2831 for (unsigned int h = 0; h < height(); ++h) {
2832 for (unsigned int w = 0; w < width(); ++w) {
2833 setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2839 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2846 void Tensor::copy(const Tensor &from) {
2847 // todo: enable copy to non-contiguous tensor
2849 throw std::runtime_error("Cannot copy non-contiguous tensor");
2852 if (from.size() != 0 && size() == from.size() &&
2853 getDataType() == from.getDataType()) {
2854 reshape(from.getDim());
2855 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2856 copy(from.getData());
2857 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2859 copy(from.getData<__fp16>());
2861 throw std::invalid_argument("Error: enable-fp16 is not enabled");
2866 Tensor t = Tensor(from.getDim(), from.getData());
2871 void Tensor::copyData(const Tensor &from) {
2872 // todo: enable copy to non-contiguous tensor
2874 throw std::runtime_error("Cannot copy non-contiguous tensor");
2877 if (size() != from.size())
2878 throw std::invalid_argument("Size of tensor to copy must match");
2880 if (getDataType() != from.getDataType())
2881 throw std::invalid_argument("Data type of tensor to copy must match");
2883 copy(from.getData());
2886 Tensor Tensor::clone() const {
2893 void Tensor::reshape(const TensorDim &d) {
2895 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2896 << getName() << " is not contiguous, cannot reshape.";
2898 NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument)
2899 << "[Tensor]: reshape cannot change the buffer size, trying reshaping "
2901 << getDim() << " to " << d;
2904 strides = d.computeStrides();
2907 void Tensor::fill(const Tensor &from, bool alloc) {
2908 if (alloc && this->empty()) {
2913 if (!from.contiguous || !contiguous) {
2914 /// @todo enable this if needed
2915 throw nntrainer::exception::not_supported(
2916 "[Tensor::fill] non-contiguous tensors are not supported");
2919 if (dim != from.getDim()) {
2920 throw std::invalid_argument("[Tensor::fill] dimension must be the same");
2923 if (strides != from.getStrides()) {
2924     /// @todo length does not represent the buffer size; there should be a way
2925     /// to get the buffer size
2926 throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
2929 this->copy(from.getData());
2932 void Tensor::save(std::ostream &file) {
2933 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2934 << getName() << " is not contiguous, cannot save.";
2936 std::streamsize sz = static_cast<std::streamsize>(bytes());
2937 NNTR_THROW_IF(sz < 0, std::invalid_argument)
2938 << "save size: " << bytes()
2939 << " is too big. It cannot be represented by std::streamsize";
2941 checkedWrite(file, (char *)getData(), sz, "[Tensor::save] operation failed");
2945 void Tensor::read(std::ifstream &file) {
2946 NNTR_THROW_IF(!contiguous, std::invalid_argument)
2947 << getName() << " is not contiguous, cannot read.";
2949 std::streamsize sz = static_cast<std::streamsize>(bytes());
2951 NNTR_THROW_IF(sz < 0, std::invalid_argument)
2952 << "read size: " << bytes()
2953 << " is too big. It cannot be represented by std::streamsize";
2955 checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
2960 * @brief Calculate average value according to the axis.
2962 Tensor Tensor::average(unsigned int axis) const {
2963 Tensor t("", this->getFormat(), this->getDataType());
2964 return average(axis, t);
2968 * @brief Calculate average value according to the axis.
2970 Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
2971 if (axis >= TensorDim::MAXDIM)
2972 throw std::out_of_range(
2973 "negative axis or axis more then MAXDIM is invalid");
2975 unsigned int axis_size = dim.getDim()[axis];
2979 this->sum(axis, output, 1.0 / ((float)axis_size));
2984 Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
2985 Tensor t("", this->getFormat(), this->getDataType());
2986 return average(axes, t);
2989 Tensor &Tensor::average(const std::vector<unsigned int> &axes,
2990 Tensor &output) const {
2992 return this->average(output);
2994 TensorDim ret_shape(getTensorType());
2996 for (const auto &idx : axes) {
2997 if (idx >= TensorDim::MAXDIM) {
2998       throw std::out_of_range("axis greater than MAXDIM is invalid");
3000 ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
3003 return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
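// Illustrative example (shapes assumed): averaging a [2, 3, 4, 5] tensor over
// axes {2, 3} sums over those axes and scales by the reciprocal of the number
// of averaged elements (1/20 here), yielding an output of dim [2, 3, 1, 1].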
3007 * @brief Calculate average value according to the axis.
3009 Tensor Tensor::average() const {
3010 Tensor result = *this;
3011 unsigned int axis = 0;
3012 if (this->getFormat() == Tformat::NHWC) {
3013 result.reshape({1, dim.getDataLen(), 1, 1, this->getTensorType()});
3016 result.reshape({1, 1, 1, dim.getDataLen(), this->getTensorType()});
3019 return result.average(axis);
3023 * @brief Calculate average value according to the axis.
3025 Tensor &Tensor::average(Tensor &output) const {
3026 Tensor result = *this;
3027 result.reshape({1, 1, 1, dim.getDataLen()});
3028 return result.average(3, output);
3031 void Tensor::setValue(float val) {
3032 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3033 << getName() << " is not contiguous, cannot set value.";
3034 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3035 float *data = getData<float>();
3036 std::fill(data, data + size(), val);
3037 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3039 __fp16 *data = getData<__fp16>();
3040 std::fill(data, data + size(), (__fp16)val);
3042 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3047 void Tensor::setZero() {
3048 if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
3050 sscal(size(), 0, getData<float>(), 1);
3052 apply_i([](float val) -> float { return 0; });
3053 } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
3056 sscal(size(), 0, getData<__fp16>(), 1);
3058 apply_i([](__fp16 val) -> __fp16 { return 0; });
3060 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3065 std::vector<unsigned int> Tensor::argmax() const {
3066 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3067 << getName() << " is not contiguous, cannot get argmax.";
3068 std::vector<unsigned int> result;
3070 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3071 const float *data = getData();
3072 size_t batch_size = batch();
3073 size_t feature_len = dim.getFeatureLen();
3075 result.resize(batch_size);
3077 for (unsigned int b = 0; b < batch_size; b++) {
3079 std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
3080 result[b] = std::distance(data, max_iter) - (b * feature_len);
3083 if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3085 const __fp16 *data = getData<__fp16>();
3086 size_t batch_size = batch();
3087 size_t feature_len = dim.getFeatureLen();
3089 result.resize(batch_size);
3091 for (unsigned int b = 0; b < batch_size; b++) {
3093 std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
3094 result[b] = std::distance(data, max_iter) - (b * feature_len);
3097 throw std::invalid_argument("Error: enable-fp16 is not enabled");
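// Illustrative example (values assumed): for a [2, 1, 1, 3] tensor holding
// {0.1, 0.9, 0.2, 0.5, 0.3, 0.4}, argmax() returns {1, 0}: the index of the
// largest value within each batch's feature vector.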
3104 float Tensor::l2norm() const {
3105 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3106 << getName() << " is not contiguous, cannot get l2norm.";
3108 unsigned int len = size();
3109 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3110 const float *data = getData<float>();
3111 ret = snrm2(len, data, 1);
3112 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3114 const __fp16 *data = getData<__fp16>();
3115 ret = snrm2(len, data, 1);
3117 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3123 float Tensor::max_abs() const {
3124 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3125 << getName() << " is not contiguous, cannot get max_abs.";
3127 unsigned int len = size();
3129 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3130 const float *data = getData<float>();
3132 unsigned int idx = isamax(len, data, 1);
3133 ret = *(data + idx);
3135 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3137 const __fp16 *data = getData<__fp16>();
3139 unsigned int idx = isamax(len, data, 1);
3140 ret = *(data + idx);
3142 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3148 Tensor &Tensor::normalization(Tensor &output) const {
3150 output = Tensor(dim);
3153 output.normalization_i();
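// Note derived from the code below: normalization_i() maps each element x to
// (x - min) / (max - min), so values end up in the [0, 1] range; the
// degenerate case where max equals min takes a separate path.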
3158 void Tensor::normalization_i() {
3159 NNTR_THROW_IF(!contiguous, std::invalid_argument)
3160 << getName() << " is not contiguous, cannot do normalization.";
3162 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3163 const float *data = getData();
3165 auto bounds = std::minmax_element(data, data + size());
3166 const float min = *bounds.first;
3167 const float max = *bounds.second;
3171 this->subtract_i(tmp);
3173 this->subtract_i(min);
3174 this->divide_i(max - min);
3176 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3178 const __fp16 *data = getData<__fp16>();
3180 auto bounds = std::minmax_element(data, data + size());
3181 const __fp16 min = *bounds.first;
3182 const __fp16 max = *bounds.second;
3186 this->subtract_i(tmp);
3188 this->subtract_i(min);
3189 this->divide_i(max - min);
3192 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3197 LazyTensor Tensor::chain() const { return LazyTensor(*this); }
3199 Tensor &Tensor::standardization(Tensor &output) const {
3201 output = Tensor(dim);
3204 output.standardization_i();
3209 void Tensor::standardization_i() {
3210 Tensor mean_by_batch = this->sum_by_batch();
3211 mean_by_batch.divide_i(dim.getFeatureLen());
3213 this->subtract_i(mean_by_batch);
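  // Descriptive note: the code below divides each batch slice by its own
  // scale, computed as the L2 norm of the mean-centered slice divided by the
  // feature length, for both the FP32 and FP16 paths.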
3214 if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3215 Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3216 std_dev_by_batch.setZero();
3217 float *std_dev = std_dev_by_batch.getData();
3219 for (unsigned int k = 0; k < dim.batch(); ++k) {
3220 Tensor sub_this = this->getBatchSlice(k, 1);
3221 std_dev[k] = sub_this.l2norm();
3224 std_dev_by_batch.divide_i(dim.getFeatureLen());
3225 this->divide_i(std_dev_by_batch);
3226 } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3228 Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3229 std_dev_by_batch.setZero();
3230 __fp16 *std_dev = std_dev_by_batch.getData<__fp16>();
3232 for (unsigned int k = 0; k < dim.batch(); ++k) {
3233 Tensor sub_this = this->getBatchSlice(k, 1);
3234 std_dev[k] = sub_this.l2norm();
3237 std_dev_by_batch.divide_i(dim.getFeatureLen());
3238 this->divide_i(std_dev_by_batch);
3240 throw std::invalid_argument("Error: enable-fp16 is not enabled");
3245 Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
3246 if (m.size() > this->size())
3247 throw exception::not_supported("broadcasting *this is not supported");
3249 const TensorDim m_dim = m.getDim();
3252 e.tensor_type = getTensorType();
3254 uint continuity[4] = {0, 1, 2, 3};
3255 if (getFormat() == Tformat::NHWC) {
3261   /// check whether the given tensors can be broadcast
3262 for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) {
3263 if (dim.getTensorDim(continuity[i]) == m_dim.getTensorDim(continuity[i])) {
3264 e.strides[i] = m.strides[i];
3268     /// If the given dimension is 1, it can be reused with its stride remaining 0.
3269     /// The case dim[i] == 1 && m_dim[i] == 1 needs to be checked first though;
3270     /// if so, the strides should not change.
3271 if (m_dim.getTensorDim(continuity[i]) == 1) {
3275 std::stringstream ss;
3276 ss << "[computeBroadcastInfo] broadcasting only allowed for "
3277 "dimension value of 1 \n"
3278 << "this: " << dim << "target: " << m_dim;
3279 throw std::invalid_argument(ss.str().c_str());
3282 /// calculate inner loop size
3285 e.strides[3] = m.strides[3];
3287 /// initiate buffer info with matching dimension strategy
3288 for (int axis = 3; axis >= 0; --axis) {
3289 if (dim.getTensorDim(continuity[axis]) !=
3290 m_dim.getTensorDim(continuity[axis])) {
3291 e.buffer_axis = axis;
3295 e.buffer_size *= dim.getTensorDim(continuity[axis]);
3298 /// check strategy that uses consecutive ones
3299 if (m_dim.getTensorDim(continuity[3]) == 1) {
3300 unsigned int inner_loop_size = 1;
3302 for (axis = 3; axis >= 0; --axis) {
3303 if (m_dim.getTensorDim(continuity[axis]) != 1) {
3307 inner_loop_size *= dim.getTensorDim(continuity[axis]);
3310 /// if consecutive-one strategy has bigger chunk size, replace the
3312 if (inner_loop_size > e.buffer_size) {
3313 e.buffer_axis = axis;
3314 e.buffer_size = inner_loop_size;
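// Illustrative example (shapes assumed, NCHW): for this->dim = [2, 3, 4, 5]
// and m.dim = [2, 3, 1, 5], the widths match (5 == 5) so buffer_size becomes 5,
// while axis 2 differs (4 vs 1), so buffer_axis is set to 2; each 5-element row
// of m is then reused across the height axis of *this during broadcast
// iteration.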
3322 Tensor Tensor::rotate_180(Tensor in) {
3323 Tensor output(in.getDim());
3324 if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
3326 for (unsigned int i = 0; i < in.batch(); ++i) {
3327 for (unsigned int j = 0; j < in.channel(); ++j) {
3328 for (unsigned int k = 0; k < in.height(); ++k) {
3329 for (unsigned int l = 0; l < in.width(); ++l) {
3330 output.setValue(i, j, k, l,
3331 in.getValue<float>(i, j, (in.height() - k - 1),
3332 (in.width() - l - 1)));
3337 } else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
3340 for (unsigned int i = 0; i < in.batch(); ++i) {
3341 for (unsigned int j = 0; j < in.channel(); ++j) {
3342 for (unsigned int k = 0; k < in.height(); ++k) {
3343 for (unsigned int l = 0; l < in.width(); ++l) {
3344 output.setValue(i, j, k, l,
3345 in.getValue<__fp16>(i, j, (in.height() - k - 1),
3346 (in.width() - l - 1)));
3352 throw std::invalid_argument("Error: enable-fp16 is not enabled");
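// Illustrative example (values assumed): rotating a [1, 1, 2, 2] tensor holding
// {a, b, c, d} in row-major order yields {d, c, b, a}, i.e. each spatial H x W
// slice is flipped along both the height and width axes.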
3358 } /* namespace nntrainer */