nntrainer/tensor/tensor.h

   1 /**
   2  * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *   http://www.apache.org/licenses/LICENSE-2.0
   8  * Unless required by applicable law or agreed to in writing, software
   9  * distributed under the License is distributed on an "AS IS" BASIS,
  10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11  * See the License for the specific language governing permissions and
  12  * limitations under the License.
  13  *
  14  *
  15  * @file        tensor.h
  16  * @date        04 December 2019
  17  * @brief       This is Tensor class for calculation
  18  * @see         https://github.com/nnstreamer/nntrainer
  19  * @author      Jijoong Moon <jijoong.moon@samsung.com>
  20  * @bug         No known bugs except for NYI items
  21  *
  22  * @todo deprecate new tensor allocation for out of place operations.
  23  */
  24
  25 #ifndef __TENSOR_H__
  26 #define __TENSOR_H__
  27 #ifdef __cplusplus
  28
  29 #include <array>
  30 #include <functional>
  31 #include <memory>
  32 #include <random>
  33 #include <stdexcept>
  34 #include <vector>
  35
  36 #include <blas_interface.h>
  37 #include <iostream>
  38 #include <memory_data.h>
  39 #include <nntrainer_error.h>
  40 #include <tensor_dim.h>
  41 #include <util_func.h>
  42
  43 #ifdef DEBUG
  44 #define EXCEPT_WHEN_DEBUG
  45 #else
  46 #define EXCEPT_WHEN_DEBUG noexcept
  47 #endif
  48
  49 #define MAKE_SHARED_TENSOR(...) std::make_shared<nntrainer::Tensor>(__VA_ARGS__)
  50
  51 #define CREATE_IF_EMPTY_DIMS(tensor, ...) \
  52   do {                                    \
  53     if (tensor.empty())                   \
  54       tensor = Tensor(__VA_ARGS__);       \
  55   } while (0);
  56
  57 namespace nntrainer {
  58
  59 using TensorDim = ml::train::TensorDim;
  60 using Tformat = ml::train::TensorDim::Format;
  61 using Tdatatype = ml::train::TensorDim::DataType;
  62
  63 class LazyTensor;
  64 class SrcSharedTensor;
  65
  66 /**
  67  * @class   Tensor Class for Calculation
  68  * @brief   Tensor Class for Calculation
  69  */
  70 class Tensor {
  71 public:
  72   /**
  73    * @brief     Enumeration of Weight Initialization Type
  74    * @todo      support initialization from file
  75    */
  76   enum class Initializer {
  77     ZEROS,          /** Zero initialization */
  78     ONES,           /** One initialization */
  79     LECUN_NORMAL,   /** LeCun normal initialization */
  80     LECUN_UNIFORM,  /** LeCun uniform initialization */
  81     XAVIER_NORMAL,  /** Xavier normal initialization */
  82     XAVIER_UNIFORM, /** Xavier uniform initialization */
  83     HE_NORMAL,      /** He normal initialization */
  84     HE_UNIFORM,     /** He uniform initialization */
  85     NONE            /** No initialization */
  86   };
  87
  88   /**
  89    * @brief     Basic Constructor of Tensor
  90    */
  91   Tensor(std::string name_ = "", Tformat fm = Tformat::NCHW,
  92          Tdatatype d_type = Tdatatype::FP32) :
  93     dim(TensorDim(fm, d_type)),
  94     strides(dim.computeStrides()),
  95     contiguous(true),
  96     initializer(Initializer::NONE),
  97     name(name_),
  98     data(nullptr),
  99     offset(0),
 100     src_tensor() {}
 101
 102   /**
 103    * @brief     Constructor of Tensor with dimension, possibly lazily
 104    * @param d Tensor dim for this tensor
 105    * @param alloc_now If the memory of the tensor must be allocated
 106    * @param init Initializer for the tensor
 107    * @param name Name of the tensor
 108    */
 109   Tensor(const TensorDim &d, bool alloc_now,
 110          Initializer init = Initializer::NONE, std::string name = "");
 111
 112   /**
 113    * @brief     Constructor of Tensor with dimension/buf
 114    * @param d Tensor dim for this tensor
 115    * @param buf buffer
 116    * @note Memory for this tensor is instantaneously allocated
 117    */
 118   Tensor(const TensorDim &d, const void *buf = nullptr);
 119
 120   /**
 121    * @brief     Constructor of Tensor
 122    * @param[in] d0 Batch of Tensor
 123    * @param[in] d1 Channel
 124    * @param[in] d2 Height
 125    * @param[in] d3 Width
 126    */
 127   Tensor(size_t d0, size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 128          Tdatatype d_type = Tdatatype::FP32) :
 129     Tensor(TensorDim(d0, d1, d2, d3, fm, d_type), nullptr){};
 130
 131   /**
 132    * @brief     Constructor of Tensor
 133    * @param[in] d1 Channel
 134    * @param[in] d2 Height
 135    * @param[in] d3 Width
 136    */
 137   Tensor(size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 138          Tdatatype d_type = Tdatatype::FP32) :
 139     Tensor(1, d1, d2, d3, fm, d_type){};
 140
 141   /**
 142    * @brief     Constructor of Tensor with batch size one and d1 size one
 143    * @param[in] d2 Height (NCHW) or Width (NHWC)
 144    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 145    */
 146   Tensor(size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 147          Tdatatype d_type = Tdatatype::FP32) :
 148     Tensor(1, 1, d2, d3, fm, d_type){};
 149
 150   /**
 151    * @brief     Constructor of Tensor with just Width or Channel
 152    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 153    */
 154   explicit Tensor(size_t d3, Tformat fm = Tformat::NCHW,
 155                   Tdatatype d_type = Tdatatype::FP32) :
 156     Tensor(1, 1, 1, d3, fm, d_type){};
 157
 158   /**
 159    * @brief     Constructor of Tensor
 160    * @param[in] d0 Batch of Tensor
 161    * @param[in] d1 Channel (NCHW) or Height (NHWC)
 162    * @param[in] d2 Height (NCHW) or Width (NHWC)
 163    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 164    */
 165   Tensor(size_t d0, size_t d1, size_t d2, size_t d3,
 166          ml::train::TensorDim::TensorType t_type) :
 167     Tensor(TensorDim(d0, d1, d2, d3, t_type), nullptr){};
 168
 169   /**
 170    * @brief     Constructor of Tensor
 171    * @param[in] d1 Channel
 172    * @param[in] d2 Height
 173    * @param[in] d3 Width
 174    */
 175   Tensor(size_t d1, size_t d2, size_t d3,
 176          ml::train::TensorDim::TensorType t_type) :
 177     Tensor(1, d1, d2, d3, t_type){};
 178
 179   /**
 180    * @brief     Constructor of Tensor with batch size one and d1 size one
 181    * @param[in] d2 Height (NCHW) or Width (NHWC)
 182    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 183    */
 184   Tensor(size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) :
 185     Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3,
 186            (t_type.format == Tformat::NCHW) ? d2 : 1,
 187            (t_type.format == Tformat::NCHW) ? d3 : d2, t_type){};
 188   /**
 189    * @brief     Constructor of Tensor with just Width or Channel
 190    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 191    */
 192   explicit Tensor(size_t d3, ml::train::TensorDim::TensorType t_type) :
 193     Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3, 1,
 194            (t_type.format == Tformat::NCHW) ? d3 : 1, t_type){};
 195
 196   /**
 197    * @brief     Constructor of Tensor
 198    * @param[in] d data for the Tensor. It needs to set format properly.
 199    */
 200
 201   Tensor(std::vector<std::vector<std::vector<std::vector<float>>>> const &d,
 202          ml::train::TensorDim::TensorType t_type) {
 203     if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
 204       throw std::out_of_range(
 205         "[Tensor] trying to initialize Tensor from empty vector");
 206     }
 207     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 208     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 209     // dim[1] == height, dim[2] == width, dim[3] == channel
 210     dim.setTensorDim(0, d.size());
 211     if (t_type.format == Tformat::NCHW) {
 212       dim.setTensorDim(1, d[0].size());
 213       dim.setTensorDim(2, d[0][0].size());
 214       dim.setTensorDim(3, d[0][0][0].size());
 215     } else {
 216       dim.setTensorDim(2, d[0].size());
 217       dim.setTensorDim(3, d[0][0].size());
 218       dim.setTensorDim(1, d[0][0][0].size());
 219     }
 220
 221     setTensorType(t_type);
 222
 223     strides = dim.computeStrides();
 224
 225     MemoryData *mem_data =
 226       new MemoryData((void *)(new float[dim.getDataLen()]()));
 227     data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
 228       delete[] mem_data->getAddr<float>();
 229     });
 230     offset = 0;
 231     contiguous = true;
 232     initializer = Initializer::NONE;
 233
 234     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 235     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 236     // dim[1] == height, dim[2] == width, dim[3] == channel
 237     if (t_type.format == Tformat::NCHW) {
 238       for (unsigned int i = 0; i < batch(); ++i)
 239         for (unsigned int j = 0; j < channel(); ++j)
 240           for (unsigned int k = 0; k < height(); ++k)
 241             for (unsigned int l = 0; l < width(); ++l)
 242               this->setValue(i, j, k, l, d[i][j][k][l]);
 243     } else {
 244       for (unsigned int i = 0; i < batch(); ++i)
 245         for (unsigned int j = 0; j < height(); ++j)
 246           for (unsigned int k = 0; k < width(); ++k)
 247             for (unsigned int l = 0; l < channel(); ++l)
 248               this->setValue(i, l, j, k, d[i][j][k][l]);
 249     }
 250   };
 251
 252   /**
 253    * @brief     Constructor of Tensor
 254    * @note      This constructor copies vector again. needs refactoring
 255    * @param[in] d data for the Tensor. It needs to set format properly.
 256    */
 257   Tensor(std::vector<std::vector<std::vector<float>>> const &d,
 258          ml::train::TensorDim::TensorType t_type) :
 259     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 260
 261   /**
 262    * @brief     Constructor of Tensor
 263    * @note      This constructor copies vector again. needs refactoring
 264    * @param[in] d data for the Tensor with batch size one
 265    */
 266   Tensor(std::vector<std::vector<float>> const &d,
 267          ml::train::TensorDim::TensorType t_type) :
 268     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 269
 270   Tensor(std::vector<std::vector<std::vector<std::vector<__fp16>>>> const &d,
 271          ml::train::TensorDim::TensorType t_type) {
 272
 273     if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
 274       throw std::out_of_range(
 275         "[Tensor] trying to initialize Tensor from empty vector");
 276     }
 277
 278     dim.setTensorDim(0, d.size());
 279     if (t_type.format == Tformat::NCHW) {
 280       dim.setTensorDim(1, d[0].size());
 281       dim.setTensorDim(2, d[0][0].size());
 282       dim.setTensorDim(3, d[0][0][0].size());
 283     } else {
 284       dim.setTensorDim(2, d[0].size());
 285       dim.setTensorDim(3, d[0][0].size());
 286       dim.setTensorDim(1, d[0][0][0].size());
 287     }
 288
 289     setTensorType(t_type);
 290
 291     strides = dim.computeStrides();
 292
 293     MemoryData *mem_data =
 294       new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
 295     data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
 296       delete[] mem_data->getAddr<__fp16>();
 297     });
 298     offset = 0;
 299     contiguous = true;
 300     initializer = Initializer::NONE;
 301
 302     setDataType(Tdatatype::FP16);
 303
 304     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 305     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 306     // dim[1] == height, dim[2] == width, dim[3] == channel
 307     if (t_type.format == Tformat::NCHW) {
 308       for (unsigned int i = 0; i < batch(); ++i)
 309         for (unsigned int j = 0; j < channel(); ++j)
 310           for (unsigned int k = 0; k < height(); ++k)
 311             for (unsigned int l = 0; l < width(); ++l)
 312               this->setValue(i, j, k, l, d[i][j][k][l]);
 313     } else {
 314       for (unsigned int i = 0; i < batch(); ++i)
 315         for (unsigned int j = 0; j < height(); ++j)
 316           for (unsigned int k = 0; k < width(); ++k)
 317             for (unsigned int l = 0; l < channel(); ++l)
 318               this->setValue(i, l, j, k, d[i][j][k][l]);
 319     }
 320   };
 321
 322   /**
 323    * @brief     Constructor of Tensor
 324    * @note      This constructor copies vector again. needs refactoring
 325    * @param[in] d data for the Tensor
 326    */
 327   Tensor(std::vector<std::vector<std::vector<__fp16>>> const &d,
 328          ml::train::TensorDim::TensorType t_type) :
 329     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 330
 331   /**
 332    * @brief     Constructor of Tensor
 333    * @note      This constructor copies vector again. needs refactoring
 334    * @param[in] d data for the Tensor with batch size one
 335    */
 336   Tensor(std::vector<std::vector<__fp16>> const &d,
 337          ml::train::TensorDim::TensorType t_type) :
 338     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 339
 340   /**
 341    *  @brief  Copy constructor of Tensor (defaulted, member-wise copy).
 342    *  @param[in] rhs Tensor to be copied.
 343    */
 344   Tensor(const Tensor &rhs) = default;
 345
 346   /**
 347    *  @brief  Move constructor of Tensor (defaulted, member-wise move).
 348    *  @param[in] rhs Tensor to be moved.
 349    */
 350   Tensor(Tensor &&rhs) noexcept = default;
 351
 352   /**
 353    * @brief  Copy assignment operator (defaulted, member-wise copy).
 354    * @param[in] rhs Tensor to be copied.
 355    */
 356   Tensor &operator=(const Tensor &rhs) = default;
 357
 358   /**
 359    * @brief  Move assignment operator (defaulted, member-wise move).
 360    * @param[in] rhs Tensor to be moved.
 361    */
 362   Tensor &operator=(Tensor &&rhs) noexcept = default;
 363
 364   /**
 365    * @brief Construct a new Tensor object from a buffer
 366    * This will not copy buffer to a new tensor but directly uses it
 367    *
 368    * @param buf buffer
 369    * @param bytes buffer size in bytes
 370    * @param d tensor dim
 371    * @param offset offset to be used from current
 372    * @return Tensor object
 373    * @throws std::invalid_argument if buf is null
 374    */
 375   template <typename T = float>
 376   static Tensor Map(T *buf, unsigned int bytes, const TensorDim &d,
 377                     size_t offset = 0) {
 378     if (d.getDataLen() == 0 || buf == nullptr) {
 379       throw std::invalid_argument(
 380         "[Tensor::Map] empty tensor dim is not allowed");
 381     }
 382
 383     if (d.getDataLen() * sizeof(T) + offset > bytes) {
 384       throw std::invalid_argument(
 385         "Creating shared tensor of size bigger than tensor memory.");
 386     }
 387
 388     Tensor tmp;
 389     tmp.dim = d;
 390     tmp.strides = d.computeStrides();
 391     /// Tensor does not own the memory
 392     tmp.data = std::shared_ptr<MemoryData>(new MemoryData((void *)buf),
 393                                            std::default_delete<MemoryData>());
 394     tmp.offset = offset;
 395
 396     return tmp;
 397   };
 398
 399   friend void swap(Tensor &lhs, Tensor &rhs) noexcept {
 400     std::swap(lhs.dim, rhs.dim);
 401     std::swap(lhs.strides, rhs.strides);
 402     std::swap(lhs.contiguous, rhs.contiguous);
 403     std::swap(lhs.initializer, rhs.initializer);
 404     std::swap(lhs.data, rhs.data);
 405     std::swap(lhs.name, rhs.name);
 406   }
 407
 408   /**
 409    * @brief     Comparison operator overload
 410    * @param[in] rhs Tensor to be compared with
 411    */
 412   bool operator==(const Tensor &rhs) const;
 413
 414   /**
 415    * @brief     Comparison operator overload
 416    * @param[in] rhs Tensor to be compared with
 417    */
 418   bool operator!=(const Tensor &rhs) const { return !(*this == rhs); }
 419
 420   /**
 421    * @brief    Allocate memory for this tensor
 422    */
 423   void allocate();
 424
 425   /**
 426    * @brief    Deallocate memory for this tensor
 427    * @note     This will not necessary free the memory as tensors share memory
 428    */
 429   void deallocate() {
 430     data = nullptr;
 431     offset = 0;
 432   }
 433
 434   /**
 435    * @brief    Check if the tensor has memory allocated/assigned/associated
 436    */
 437   bool isAllocated() const { return data != nullptr; }
 438
 439   /**
 440    * @brief     return value at specific location
 441    * @param[in] batch batch location
 442    * @param[in] c channel location
 443    * @param[in] h height location
 444    * @param[in] w width location
 445    */
 446   template <typename T = float>
 447   const T &getValue(unsigned int batch, unsigned int c, unsigned int h,
 448                     unsigned int w) const noexcept {
 449     return getValue<T>(getIndex(batch, c, h, w));
 450   }
 451
 452   template <typename T = float>
 453   T &getValue(unsigned int batch, unsigned int c, unsigned int h,
 454               unsigned int w) noexcept {
 455     return getValue<T>(getIndex(batch, c, h, w));
 456   }
 457
 458   /**
 459    * @brief     return value at specific location
 460    * @param[in] idx location
 461    */
 462   template <typename T = float>
 463   const T &getValue(unsigned int idx) const noexcept {
 464     return getData<T>()[idx];
 465   }
 466
 467   /**
 468    * @brief     return value at specific location
 469    * @param[in] idx location
 470    */
 471   template <typename T = float> T &getValue(unsigned int idx) noexcept {
 472     return getData<T>()[idx];
 473   }
 474
 475   /**
 476    * @brief Get the Value thinking that it is padded
 477    * for example, for the tensor (virtually padded) below,
 478    * getValue(0, 0, 2, 2, 1, 1, .0f) will return 5
 479    * padding available for height and width axis for now
 480    * 0 0 0 0 0
 481    * 0 1 2 3 0
 482    * 0 4 5 6 0
 483    * 0 7 8 9 0
 484    * 0 0 0 0 0
 485    * @param b batch index
 486    * @param c channel index
 487    * @param h height index
 488    * @param w width index
 489    * @param ph padding height
 490    * @param pw padding width
 491    * @return float value
 492    */
 493   template <typename T = float>
 494   const T getValuePaddedVirtual(unsigned int b, unsigned int c, unsigned int h,
 495                                 unsigned int w, unsigned int ph,
 496                                 unsigned int pw,
 497                                 T pad_value = 0) const EXCEPT_WHEN_DEBUG {
 498 #if DEBUG
 499     unsigned int padded_h = 2 * ph + h;
 500     unsigned int padded_w = 2 * pw + w;
 501     if (h > padded_h && w > padded_w) {
 502       throw std::out_of_range(
 503         "[Tensor::getValuePadded] trying to access out of range");
 504     }
 505 #endif
 506
 507     if (ph <= h && h < ph + height() && pw <= w && w < pw + width()) {
 508       return getValue<T>(b, c, h - ph, w - pw);
 509     }
 510
 511     return pad_value;
 512   }
 513
 514   /**
 515    * @brief     Multiply value element by element immediately
 516    * @param[in] value multiplier
 517    * @retval    #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right
 518    * @retval    #ML_ERROR_NONE Successful
 519    */
 520   int multiply_i(float const &value);
 521
 522   /**
 523    * @brief     Multiply value element by element
 524    * @param[in] value multiplier
 525    * @retval    Calculated Tensor
 526    */
 527   Tensor multiply(float const &value) const;
 528
 529   /**
 530    * @brief     multiply value element by element
 531    * @param[in] value multiplier
 532    * @param[out] out out tensor to store the result
 533    * @retval    Calculated Tensor
 534    */
 535   Tensor &multiply(float const &value, Tensor &out) const;
 536
 537   /**
 538    * @brief     Multiply Tensor Elementwise
 539    * @param[in] m Tensor to be multiplied
 540    * @param[in] beta scalar to multiply output with and add
 541    * @retval    #ML_ERROR_NONE successful
 542    */
 543   int multiply_i(Tensor const &m, const float beta = 0.0);
 544
 545   /**
 546    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 547    * @param[in] m Tensor to be multiplied
 548    * @param[in] beta scalar to multiply output with and add
 549    * @retval    Calculated Tensor
 550    */
 551   Tensor multiply(Tensor const &m, const float beta = 0.0) const;
 552
 553   /**
 554    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 555    * @param[in] m Tensor to be multiplied
 556    * @param[out] output Tensor to store the result
 557    * @param[in] beta scalar to multiply output with and add
 558    * @retval    Calculated Tensor
 559    */
 560   Tensor &multiply(Tensor const &m, Tensor &output,
 561                    const float beta = 0.0) const;
 562
 563   /**
 564    * @brief     Multiply Tensor Elementwise
 565    * @param[in] m Tensor to be multiplied
 566    * @param[in] beta scalar to multiply output with and add
 567    * @retval    #ML_ERROR_NONE successful
 568    *
 569    * @note support different strided inputs and output
 570    * @note does not support broadcasting
 571    *
 572    * @todo merge this to multiply_i
 573    */
 574   int multiply_i_strided(Tensor const &m, const float beta = 0.0);
 575
 576   /**
 577    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 578    * @param[in] m Tensor to be multiplied
 579    * @param[in] beta scalar to multiply output with and add
 580    * @retval    Calculated Tensor
 581    *
 582    * @note support different strided inputs and output
 583    * @note does not support broadcasting
 584    *
 585    * @todo merge this to multiply
 586    */
 587   Tensor multiply_strided(Tensor const &m, const float beta = 0.0) const;
 588
 589   /**
 590    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 591    * @param[in] m Tensor to be multiplied
 592    * @param[out] output Tensor to store the result
 593    * @param[in] beta scalar to multiply output with and add
 594    * @retval    Calculated Tensor
 595    *
 596    * @note support different strided inputs and output
 597    * @note does not support broadcasting
 598    *
 599    * @todo merge this to multiply
 600    */
 601   Tensor &multiply_strided(Tensor const &m, Tensor &output,
 602                            const float beta = 0.0) const;
 603
 604   /**
 605    * @brief     Add Tensor Elementwise
 606    * @param[in] m Tensor to be added
 607    * @param[in] beta Value to scale the added tensor
 608    * @retval    #ML_ERROR_NONE successful
 609    *
 610    * @note support different strided inputs and output
 611    * @note does not support broadcasting
 612    *
 613    * @todo merge this to add_i
 614    */
 615   int add_i_strided(Tensor const &m, const float beta = 0.0);
 616
 617   /**
 618    * @brief     Add Tensor Element by Element
 619    * @param[in] m Tensor to be added
 620    * @param[in] beta Value to scale the added tensor
 621    * @retval    Calculated Tensor
 622    *
 623    * @note support different strided inputs and output
 624    * @note does not support broadcasting
 625    *
 626    * @todo merge this to add
 627    */
 628   Tensor add_strided(Tensor const &m, const float beta = 0.0) const;
 629
 630   /**
 631    * @brief     Add Tensor Element by Element
 632    * @param[in] m Tensor to be added
 633    * @param[out] output Tensor to store the result
 634    * @param[in] beta Value to scale the added tensor
 635    * @retval    Calculated Tensor
 636    *
 637    * @note support different strided inputs and output
 638    * @note does not support broadcasting
 639    *
 640    * @todo merge this to add
 641    */
 642   Tensor &add_strided(Tensor const &m, Tensor &output,
 643                       const float beta = 0.0) const;
 644
 645   /**
 646    * @brief     Divide value element by element immediately
 647    * @param[in] value divisor
 648    * @retval    #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right
 649    * @retval    #ML_ERROR_NONE Successful
 650    */
 651   int divide_i(float const &value);
 652
 653   /**
 654    * @brief     Divide value element by element
 655    * @param[in] value Divisor
 656    * @retval    Calculated Tensor
 657    */
 658   Tensor divide(float const &value) const;
 659
 660   /**
 661    * @brief     Divide value element by element
 662    * @param[in] value Divisor
 663    * @param[out] out out parameter to store the result
 664    * @retval    Calculated Tensor
 665    */
 666   Tensor &divide(float const &value, Tensor &out) const;
 667
 668   /**
 669    * @brief     divide Tensor Elementwise
 670    * @param[in] m Divisor Tensor
 671    * @retval    #ML_ERROR_NONE successful
 672    */
 673   int divide_i(Tensor const &m);
 674
 675   /**
 676    * @brief     Divide Tensor Element by Element
 677    * @param[in] m Divisor Tensor
 678    * @retval    Calculated Tensor
 679    */
 680   Tensor divide(Tensor const &m) const;
 681
 682   /**
 683    * @brief     divide Tensor Elementwise
 684    * @param[in] m Divisor Tensor
 685    * @param[out] output Tensor to store the result
 686    * @retval    Calculated Tensor
 687    */
 688   Tensor &divide(Tensor const &m, Tensor &output) const;
 689
 690   /**
 691    * @brief Add Tensor Element immediately to target tensor without mem copy
 692    * @param[in] value value to be added
 693    * @retval #ML_ERROR_NONE  Successful
 694    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 695    */
 696   int add_i(float const &value);
 697
 698   /**
 699    * @brief     Add value Element by Element
 700    * @param[in] value value to be added
 701    * @retval    Calculated Tensor
 702    */
 703   Tensor add(float const &value) const;
 704
 705   /**
 706    * @brief     Add Tensor Element by Element
 707    * @param[in] value value to be added
 708    * @param[out] out Tensor to save output without allocating new memory
 709    * @retval    Calculated Tensor
 710    */
 711   Tensor &add(float const &value, Tensor &out) const;
 712
 713   /**
 714    * @brief Add Tensor Element by Element without mem copy
 715    * @param[in] m Tensor to be added
 716    * @param[in] alpha Values to be scaled
 717    * @retval #ML_ERROR_NONE  Successful
 718    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 719    */
 720   int add_i(Tensor const &m, float const alpha = 1);
 721
 722   /**
 723    * @brief     Add Tensor Element by Element
 724    * @param[in] m Tensor to be added
 725    * @retval    Calculated Tensor
 726    */
 727   Tensor add(Tensor const &m, float const alpha = 1) const;
 728
 729   /**
 730    * @brief     Add Tensor Element by Element
 731    * @param[in] m Tensor to be added
 732    * @param[out] out Tensor to store the result
 733    * @retval    Calculated Tensor
 734    */
 735   Tensor &add(Tensor const &m, Tensor &out, float const alpha = 1) const;
 736
 737   /**
 738    * @brief     memcpyless version of subtract
 739    * @param[in] value value to subtract
 740    * @retval #ML_ERROR_NONE  Successful
 741    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 742    */
 743   int subtract_i(float const &value);
 744
 745   /**
 746    * @brief     subtract value Element by Element
 747    * @param[in] value value to be subtracted
 748    * @retval    Calculated Tensor
 749    */
 750   Tensor subtract(float const &value) const;
 751
 752   /**
 753    * @brief     Subtract Tensor Element by Element
 754    * @param[in] value value to be subtracted
 755    * @param[out] out Tensor to save output without allocating new memory
 756    * @retval    Calculated Tensor
 757    */
 758   Tensor &subtract(float const &value, Tensor &out) const;
 759
 760   /**
 761    * @brief     memcpyless version of subtract
 762    * @param[in] m Tensor to be subtracted
 763    * @retval #ML_ERROR_NONE  Successful
 764    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 765    */
 766   int subtract_i(Tensor const &m);
 767
 768   /**
 769    * @brief     Subtract Tensor Element by Element
 770    * @param[in] m Tensor to be subtracted
 771    * @retval    Calculated Tensor
 772    */
 773   Tensor subtract(Tensor const &m) const;
 774
 775   /**
 776    * @brief     Subtract Tensor Element by Element
 777    * @param[in] m Tensor to be subtracted
 778    * @param[out] out Tensor to store the result
 779    * @retval    Calculated Tensor
 780    */
 781   Tensor &subtract(Tensor const &m, Tensor &out) const;
 782
 783   /**
 784    * @brief Tensor power elementwise
 785    *
 786    * @param exponent exponent
 787    * @return int ML_ERROR_NONE if successful
 788    */
 789   int pow_i(float exponent);
 790
 791   /**
 792    * @brief    Tensor power Element by Element
 793    * @param[in] exponent exponent
 794    * @retval Calculated Tensor
 795    */
 796   Tensor pow(float exponent) const;
 797
 798   /**
 799    * @brief    Tensor power Element by Element
 800    * @param[in] exponent exponent
 801    * @param[out] out out to store the result
 802    * @retval Calculated Tensor
 803    */
 804   Tensor &pow(float exponent, Tensor &out) const;
 805
 806   /**
 807    * @brief  gaussian error function
 808    * @return int ML_ERROR_NONE if successful
 809    */
 810   int erf_i();
 811
 812   /**
 813    * @brief    gaussian error function
 814    * @retval Calculated Tensor
 815    */
 816   Tensor erf() const;
 817
 818   /**
 819    * @brief    gaussian error function
 820    * @param[out] out out to store the result
 821    * @retval Calculated Tensor
 822    */
 823   Tensor &erf(Tensor &out) const;
 824
 825   unsigned int sizeofData() { return dim.getDataTypeSize(); }
 826
 827   /**
 828    * @brief     Dot Product of Tensor ( equal MxM )
 829    * @details   This applies dot of the last dimension of this and second-last
 830    * dimension of passed tensor m.
 831    * @param[in] m Tensor
 832    * @param[in] trans Transpose
 833    * @param[in] trans_m Transpose m
 834    * @retval    Calculated Tensor
 835    */
 836   Tensor dot(Tensor const &m, bool trans = false, bool trans_m = false) const;
 837
 838   /**
 839    * @brief     Dot Product of Tensor ( equal MxM )
 840    * @details   This applies dot of the last dimension of this and second-last
 841    * dimension of passed tensor m.
 842    * @param[in] m Tensor
 843    * @param[in] output output Tensor
 844    * @param[in] trans Transpose
 845    * @param[in] trans_m Transpose m
 846    * @param[in] beta beta
 847    * @retval    Calculated Tensor
 848    */
 849   Tensor &dot(Tensor const &m, Tensor &output, bool trans = false,
 850               bool trans_m = false, float beta = 0.0f) const;
 851
 852   /**
 853    * @brief compute the derivative of this in the current tensor
 854    * @param m same as given to the dot()
 855    * @param output_deriv the derivative of the output
 856    * @param[in] trans same as given to the dot()
 857    * @param[in] trans_m same as given to the dot()
 858    * @param[in] beta same as given to the dot()
 859    * @note This will compute the derivative in-place and will overwrite existing
 860    * data in the tensor
 861    */
 862   Tensor &dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
 863                           bool trans = false, bool trans_m = false,
 864                           float beta = 0.0f);
 865
 866   /**
 867    * @brief compute the derivative wrt m in the m tensor
 868    * @param[out] m_deriv tensor where derivative wrt m will be stored
 869    * @param output_deriv the derivative of the output
 870    * @param[in] trans same as given to the dot()
 871    * @param[in] trans_m same as given to the dot()
 872    * @param[in] beta same as given to the dot()
        * @return Tensor& reference to a tensor (presumably @a m_deriv -- confirm in
        * the implementation)
 873    * @note The caller tensor must be the same tensor as the one which called the
 874    * dot() product.
 875    */
 876   Tensor &dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
 877                           bool trans = false, bool trans_m = false,
 878                           float beta = 0.0f) const;
 879
 880   /**
 881    * @copydoc Tensor::dot(Tensor const &m, Tensor &output, bool trans,
 882               bool trans_m, float beta) const
 883    * @details performs dot operation over a batch of inputs
        * @note the output tensor parameter is named @a result in this overload
        * (it is named @a output in dot())
 884    */
 885   Tensor &dotBatched(Tensor const &m, Tensor &result, bool trans = false,
 886                      bool trans_m = false, float beta = 0.0f) const;
 887
 888   /**
 889    * @copydoc Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const
 890    &output_deriv, bool trans, bool trans_m, float beta)
        * @details NOTE(review): judging by the name this is the counterpart of
        * dot_deriv_wrt_1() for dotBatched() -- confirm in the implementation
 891    */
 892   Tensor &dot_batched_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
 893                                   bool trans = false, bool trans_m = false,
 894                                   float beta = 0.0f);
 895
 896   /**
 897    * @copydoc Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const
 898    &output_deriv, bool trans, bool trans_m, float beta) const
        * @details NOTE(review): judging by the name this is the counterpart of
        * dot_deriv_wrt_2() for dotBatched() -- confirm in the implementation
 899    */
 900   Tensor &dot_batched_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
 901                                   bool trans = false, bool trans_m = false,
 902                                   float beta = 0.0f) const;
 903
 904   /**
 905    * @brief Transpose Tensor
 906    *
 907    * @param direction to transpose ex) 0:2:1
 908    * @return Tensor transposed result, returned by value
 909    */
 910   Tensor transpose(const std::string &direction) const;
 911
 912   /**
 913    * @brief Transpose Tensor
 914    * @param direction to transpose ex) 0:2:1
 915    * @param[out] out Tensor to save to, dimension is always reshaped.
 916    * @retval Tensor& reference to the out
 917    */
 918   Tensor &transpose(const std::string &direction, Tensor &out) const;
 919
 920   /**
 921    * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate)
 922    * @param dropout drop out rate
 923    * @retval Tensor drop out mask, returned by value
 924    */
 925   Tensor dropout_mask(float dropout) const;
 926
 927   /**
 928    * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) inplace
 929    * @param dropout drop out rate
        * @note non-const overload without a return value
 930    */
 931   void dropout_mask(float dropout);
 932
 933   /**
 934    * @brief Calculate filter mask
 935    * @param mask_len length of each mask along the last axis
 936    * @param reverse invert the mask (default false)
 937    */
 938   void filter_mask(const Tensor &mask_len, bool reverse = false);
 939
 940   /**
 941    * @brief Calculate 2 Zone Out Masks
 942    * @details Calculate zone out mask according to the bernoulli distribution.
 943    * Zone out mask with rate @a zoneout for inplace and the other zone out mask
 944    * with rate @a (1-zoneout).
 945    * @param zoneout zone out rate
 946    * @retval Tensor zone out mask for opposite tensor, returned by value
 947    */
 948   Tensor zoneout_mask(float zoneout);
 949
 950   /**
 951    * @brief Calculate 2 Zone Out Masks
 952    * @details Calculate zone out mask according to the bernoulli distribution.
 953    * Zone out mask with rate @a zoneout for inplace and the other zone out mask
 954    * with rate @a (1-zoneout).
 955    * @param[out] opposite opposite zone out mask
 956    * @param zoneout zone out rate
 957    */
 958   void zoneout_mask(Tensor &opposite, float zoneout);
 959
 960   /**
 961    * @brief     sum all the Tensor elements according to the batch
 962    * @retval    Calculated Tensor(batch, 1, 1, 1), returned by value
 963    */
 964   Tensor sum_by_batch() const;
 965
 966   /**
 967    * @brief     sum all the Tensor elements according to the axis
 968    *            0 : batch direction
 969    *            1 : channel direction
 970    *            2 : height direction
 971    *            3 : width direction
 972    * @param[in] axis Axis to calculate sum along
 973    * @param[in] alpha Scale the sum by this value (default 1.0)
 974    * @retval    Calculated Tensor, returned by value
 975    */
 976   Tensor sum(unsigned int axis, float alpha = 1.0) const;
 977
 978   /**
 979    * @brief     sum all the Tensor elements according to the axis
 980    *            0 : batch direction
 981    *            1 : channel direction
 982    *            2 : height direction
 983    *            3 : width direction
 984    * @param[in] axis Axis to calculate sum along
 985    * @param[out] output output tensor
 986    * @param[in] alpha Scale the sum by this value (default 1.0)
        * @param[in] beta default 0.0 (NOTE(review): presumably scales the existing
        * contents of output -- confirm in the implementation)
 987    * @retval    Tensor& reference to the calculated Tensor
 988    */
 989   Tensor &sum(unsigned int axis, Tensor &output, float alpha = 1.0,
 990               float beta = 0.0) const;
 991
 992   /**
 993    * @brief sum all the Tensor by multiple axes
 994    *
 995    * @param axes axes to sum along
 996    * @param alpha Scale the sum by this value (default 1.0)
 997    * @return Tensor calculated tensor, returned by value
 998    */
 999   Tensor sum(const std::vector<unsigned int> &axes, float alpha = 1.0) const;
1000
1001   /**
1002    * @brief sum all the Tensor by multiple axes
1003    *
1004    * @param axes axes to sum along
1005    * @param[out] output output tensor
1006    * @param alpha Scale the sum by this value (default 1.0)
1007    * @return Tensor& reference to the calculated tensor
1008    */
1009   Tensor &sum(const std::vector<unsigned int> &axes, Tensor &output,
1010               float alpha = 1.0) const;
1011
1012   /**
1013    * @brief     Averaging the Tensor elements according to the axis
1014    *            0 : batch direction
1015    *            1 : channel direction
1016    *            2 : height direction
1017    *            3 : width direction
        * @param[in] axis Axis to average along
1018    * @retval    Calculated Tensor, returned by value
1019    */
1020   Tensor average(unsigned int axis) const;
1021   /**
1022    * @brief     Averaging the Tensor elements according to the axis
1023    * @param[in] axis Axis to average along
        * @param[out] output output tensor
1024    * @retval    Tensor& reference to the calculated Tensor
1025    */
1026   Tensor &average(unsigned int axis, Tensor &output) const;
1027
1028   /**
1029    * @brief average all the Tensor by multiple axes
1030    *
1031    * @param axes axes to average along
1032    * @return Tensor calculated tensor, returned by value
1033    */
1034   Tensor average(const std::vector<unsigned int> &axes) const;
1035
1036   /**
1037    * @brief average all the Tensor by multiple axes
1038    *
1039    * @param axes axes to average along
1040    * @param[out] output output tensor
1041    * @return Tensor& reference to the calculated tensor
1042    */
1043   Tensor &average(const std::vector<unsigned int> &axes, Tensor &output) const;
1044
1045   /**
1046    * @brief     Averaging the Tensor elements by all axis
1047    * @retval    Calculated Tensor, returned by value
1048    */
1049   Tensor average() const;
1050
1051   /**
1052    * @brief     Averaging the Tensor elements by all axis
        * @param[out] output output tensor
1053    * @retval    Tensor& reference to the calculated Tensor
1054    */
1055   Tensor &average(Tensor &output) const;
1056
1057   /**
1058    * @brief     Anchor a starting point to defer following evaluation
1059    * @retval    LazyTensor object that can be used with run(), returned by value
1060    */
1061   LazyTensor chain() const;
1062
1063   /**
1064    * @brief     Softmax the Tensor elements
1065    * @retval    Calculated Tensor, returned by value
1066    */
1067   Tensor softmax() const;
1068
1069   /**
1070    * @brief     l2norm the Tensor elements
1071    * @retval    Calculated l2norm as a float
1072    */
1073   float l2norm() const;
1074
1075   /**
1076    * @brief     Normalize the Tensor elements
        * @param[out] output output tensor
1077    * @retval    Tensor& reference to the calculated Tensor
1078    */
1079   Tensor &normalization(Tensor &output) const;
1080
1081   /**
1082    * @brief     Standardize the Tensor elements
        * @param[out] output output tensor
1083    * @retval    Tensor& reference to the calculated Tensor
1084    */
1085   Tensor &standardization(Tensor &output) const;
1086
1087   /**
1088    * @brief     Normalize the Tensor elements in-place
1089    * @note      returns nothing; the result is kept in this tensor
1090    */
1091   void normalization_i();
1092
1093   /**
1094    * @brief     Standardize the Tensor elements in-place
1095    * @note      returns nothing; the result is kept in this tensor
1096    */
1097   void standardization_i();
1098
1099   /**
        * @brief     get address of the i-th element
        * @param[in] i linear element index
        * @retval    address of ith data; nullptr if i is beyond the index bound
        * computed from the tensor dimensions or if getData() yields no buffer
        */
       template <typename T = float> T *getAddress(unsigned int i) {
1100     size_t index = getIndex(batch(), channel(), height(), width());
1101     if (i > index) {
1102       return nullptr;
1103     }
         // getData() returns nullptr when no memory is attached; indexing it would
         // hand the caller a bogus non-null address instead of nullptr.
         T *base = getData<T>();
         if (base == nullptr) {
           return nullptr;
         }
1104     return &base[i];
1105   }
1106
1107   /**
1108    * @brief     get address of the i-th element (read-only)
        * @param[in] i linear element index
1109    * @retval    address of ith data; nullptr if i is beyond the index bound
        * computed from the tensor dimensions or if getData() yields no buffer
1110    */
1111   template <typename T = float> const T *getAddress(unsigned int i) const {
1112     size_t index = getIndex(batch(), channel(), height(), width());
1113     if (i > index) {
1114       return nullptr;
1115     }
1116
         // getData() returns nullptr when no memory is attached; indexing it would
         // hand the caller a bogus non-null address instead of nullptr.
         const T *base = getData<T>();
         if (base == nullptr) {
           return nullptr;
         }
1117     return &base[i];
1118   }
1119
1120   /**
1121    * @brief    get address of n-d data
        * @param[in] b batch location
        * @param[in] c channel location
        * @param[in] h height location
        * @param[in] w width location
        * @retval   whatever getAddress<T>() returns for the linear index that
        * getIndex() computes from (b, c, h, w)
1122    */
1123   template <typename T = float>
1124   T *getAddress(unsigned int b, unsigned int c, unsigned int h,
1125                 unsigned int w) {
         const auto linear = getIndex(b, c, h, w);
1126     return getAddress<T>(linear);
1127   }
1128
1129   /**
1130    * @brief    get address of n-d data (read-only)
        * @param[in] b batch location
        * @param[in] c channel location
        * @param[in] h height location
        * @param[in] w width location
        * @retval   whatever getAddress<T>() returns for the linear index that
        * getIndex() computes from (b, c, h, w)
1131    */
1132   template <typename T = float>
1133   const T *getAddress(unsigned int b, unsigned int c, unsigned int h,
1134                       unsigned int w) const {
         const auto linear = getIndex(b, c, h, w);
1135     return getAddress<T>(linear);
1136   }
1137
1138   /**
1139    * @brief Apply the function to every element, using a copy of this tensor
        * object as the output of apply()
1140    *
1141    * @param f function to apply
1142    * @return int always ML_ERROR_NONE (apply() reports failures by throwing)
1143    */
1144   int apply_i(std::function<float(float)> f) {
         // NOTE(review): the in-place effect relies on a copied Tensor referring
         // to the same memory as the original -- confirm against the copy ctor.
1145     Tensor target(*this);
1146     apply(f, target);
1147
1148     return ML_ERROR_NONE;
1149   }
1150
1151   /**
1152    * @brief     Apply function element by element into a new tensor
1153    * @param[in] f function applied to each element
1154    * @retval    Tensor holding the result, returned by value
1155    */
1156   Tensor apply(std::function<float(float)> f) const {
         // starts out empty; the two-argument overload creates it on demand
1157     Tensor applied;
1158     return apply(f, applied);
1159   }
1160
1161   /**
1162    * @brief     Apply function element by element
1163    * @param[in] *function function pointer applied
1164    * @param[out] output output tensor
1165    * @retval    Tensor
1166    */
1167   Tensor &apply(std::function<float(float)> f, Tensor &output) const {
1168     CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
1169
1170     if (dim != output.dim) {
1171       /// @todo add unittest
1172       throw std::invalid_argument(
1173         "[Tensor::apply] output dimension does not match");
1174     }
1175
1176     if (dim.getDataType() == Tdatatype::FP32) {
1177       if (contiguous && output.contiguous) {
1178         const float *data = (getData<float>());
1179         float *rdata = (output.getData<float>());
1180
1181         std::transform(data, data + size(), rdata, f);
1182       } else if (strides[3] == 1 && output.strides[3] == 1) {
1183         /** @todo optimize this with combining these loops where stride is 1 */
1184         for (unsigned int b = 0; b < batch(); ++b) {
1185           for (unsigned int c = 0; c < channel(); ++c) {
1186             for (unsigned int h = 0; h < height(); ++h) {
1187               float *out_data = output.getAddress<float>(b, c, h, 0);
1188               const float *in_data = getAddress<float>(b, c, h, 0);
1189               std::transform(in_data, in_data + width(), out_data, f);
1190             }
1191           }
1192         }
1193       } else {
1194         for (unsigned int b = 0; b < batch(); ++b) {
1195           for (unsigned int c = 0; c < channel(); ++c) {
1196             for (unsigned int h = 0; h < height(); ++h) {
1197               for (unsigned int w = 0; w < width(); ++w) {
1198                 output.setValue(b, c, h, w, f(getValue<float>(b, c, h, w)));
1199               }
1200             }
1201           }
1202         }
1203       }
1204     } else if (dim.getDataType() == Tdatatype::FP16) {
1205       if (contiguous && output.contiguous) {
1206         const __fp16 *data = (getData<__fp16>());
1207         __fp16 *rdata = (output.getData<__fp16>());
1208
1209         std::transform(data, data + size(), rdata, f);
1210       } else if (strides[3] == 1 && output.strides[3] == 1) {
1211         /** @todo optimize this with combining these loops where stride is 1 */
1212         for (unsigned int b = 0; b < batch(); ++b) {
1213           for (unsigned int c = 0; c < channel(); ++c) {
1214             for (unsigned int h = 0; h < height(); ++h) {
1215               __fp16 *out_data = (__fp16 *)output.getAddress(b, c, h, 0);
1216               const __fp16 *in_data = (__fp16 *)getAddress(b, c, h, 0);
1217               std::transform(in_data, in_data + width(), out_data, f);
1218             }
1219           }
1220         }
1221       } else {
1222         for (unsigned int b = 0; b < batch(); ++b) {
1223           for (unsigned int c = 0; c < channel(); ++c) {
1224             for (unsigned int h = 0; h < height(); ++h) {
1225               for (unsigned int w = 0; w < width(); ++w) {
1226                 output.setValue(b, c, h, w,
1227                                 f((float)((__fp16)getValue(b, c, h, w))));
1228               }
1229             }
1230           }
1231         }
1232       }
1233     }
1234
1235     return output;
1236   };
1237
1238   /**
1239    * @brief     Apply function to Tensor
1240    * @param[in] f function applied; it takes a Tensor and returns a Tensor
1241    * @retval    Tensor returned by value
1242    */
1243   Tensor apply(std::function<Tensor(Tensor)> f) const;
1244
1245   /**
1246    * @brief     Apply function to Tensor
1247    * @param[in] f function applied; it takes a Tensor and an output Tensor
1248    * @param[out] output output tensor
1249    * @retval    Tensor& reference to a tensor (presumably output -- confirm in
        * the implementation)
1250    */
1251   Tensor &apply(std::function<Tensor &(Tensor, Tensor &)> f,
1252                 Tensor &output) const;
1253
1254   /**
1255    * @brief     Print element
1256    * @param[in,out] out out stream to print to
1257    * @note      returns nothing
1258    */
1259   void print(std::ostream &out) const;
1260
1261   /**
1262    * @brief     Print element
1263    * @param[in,out] out out stream to print to
1264    * @param[in] opt print formatting option. opt=0 would pretty print the data,
1265    * else it would print the raw data.
1266    * @note      returns nothing
1267    */
1268   void print_(std::ostream &out, uint opt = 0) const;
1269
1270   /**
1271    * @brief     Get size of current tensor
1272    * @retval    size_t size of the current tensor, as given by dim.getDataLen()
1273    */
1274   size_t size() const { return dim.getDataLen(); }
1275
1276   /**
1277    * @brief     Get if the tensor is empty
1278    * @retval    true if the tensor is empty, i.e. size() is 0
1279    */
1280   bool empty() const { return size() == 0; }
1281
1282   /**
1283    * @brief     Get size of the data in bytes
1284    * @retval    size_t Size in bytes: size() times dim.getDataTypeSize()
1285    */
1286   size_t bytes() const { return size() * dim.getDataTypeSize(); }
1287
1288   /**
1289    * @brief     Set the element value
1290    * @param[in] batch batch location
1291    * @param[in] c channel location
1292    * @param[in] h height location
1293    * @param[in] w width location
1294    * @param[in] value value to be stored
1295    */
1296   void setValue(unsigned int batch, unsigned int c, unsigned int h,
1297                 unsigned int w, float value) noexcept {
1298     if (getDataType() == Tdatatype::FP32) {
1299       getData<float>()[getIndex(batch, c, h, w)] = value;
1300     } else if (getDataType() == Tdatatype::FP16) {
1301       getData<__fp16>()[getIndex(batch, c, h, w)] = value;
1302     }
1303   }
1304
1305   /**
1306    * @brief     add the element value to the location
1307    * @param[in] batch batch location
1308    * @param[in] c channel location
1309    * @param[in] h height location
1310    * @param[in] w width location
1311    * @param[in] value value to be stored
1312    * @param[in] beta scalar to multiply output with and add
1313    */
1314   void addValue(unsigned int batch, unsigned int c, unsigned int h,
1315                 unsigned int w, float value, float beta) noexcept {
1316     auto const &idx = getIndex(batch, c, h, w);
1317     if (dim.getDataType() == Tdatatype::FP32) {
1318       getData<float>()[idx] *= beta;
1319       getData<float>()[idx] += value;
1320     } else if (dim.getDataType() == Tdatatype::FP16) {
1321       getData<__fp16>()[idx] *= beta;
1322       getData<__fp16>()[idx] += value;
1323     }
1324   }
1325
1326   /**
1327    * @brief     Store an int at the given offset of the data buffer
1328    * @param[in] offset offset from start location, counted in int elements
1329    * @param[in] value value to be stored
1330    *
1331    * @todo      This is a temporary workout. Remove this once multiple datatypes
1332    * are supported.
1333    */
1334   void setValueInt(unsigned int offset, int value) noexcept {
         // the buffer obtained through getData() is reinterpreted as int storage
1335     reinterpret_cast<int *>(getData())[offset] = value;
1337   }
1338
1339   /**
1340    * @brief     Fill the Tensor elements with value
1341    * @param[in] value value to be stored in the elements
1342    */
1343   void setValue(float value);
1344
1345   /**
1346    * @brief     Fill the Tensor elements with zero (takes no arguments)
1347    */
1348   void setZero();
1349
1350   /**
1351    * @brief Fill the tensor with values produced by a distribution
1352    *
1353    * @tparam T element type the produced values are cast to and stored as
        * @tparam Engine type of the distribution object
1354    * @param dist distribution object, invoked as dist(rng) once per element
        * @note NNTR_THROW_IF is given std::invalid_argument for a tensor that is
        * not contiguous
1355    */
1356   template <typename T, typename Engine> void setDist(Engine dist) {
1357     NNTR_THROW_IF(!contiguous, std::invalid_argument)
1358       << getName() << " Tensor is not contiguous, cannot set distribution";
1359
1360     T *data_ = getData<T>();
         // size() returns size_t; keep the full width so the element count is
         // not truncated to unsigned int
1361     const size_t len = size();
1362     for (size_t i = 0; i < len; ++i) {
1363       data_[i] = (T)dist(rng);
1364     }
1365   }
1366
1367   /**
1368    * @brief     Set the tensor with random normal distribution
1369    * @param[in] mean mean of the distribution (default 0.0f)
1370    * @param[in] std standard deviation of the distribution (default 0.05f)
1371    */
1372   void setRandNormal(float mean = 0.0f, float std = 0.05f);
1373
1374   /**
1375    * @brief     Set the tensor with random uniform distribution
1376    * @param[in] min minimum value for the distribution (default -0.05f)
1377    * @param[in] max maximum value for the distribution (default 0.05f)
1378    */
1379   void setRandUniform(float min = -0.05f, float max = 0.05f);
1380
1381   /**
1382    * @brief     Set the tensor with random bernoulli distribution
1383    * @param[in] probability probability value for the distribution (default 0.5f)
1384    */
1385   void setRandBernoulli(float probability = 0.5f);
1386
1387   /**
1388    * @brief     Initialize the memory of the given tensor
        * @note      NOTE(review): presumably uses the stored initializer member,
        * as the overload taking an Initializer sets it before calling this
1389    */
1390   void initialize();
1391
1392   /**
1393    * @brief     Initialize the memory of the given tensor
1394    * @param     init Initializer to use for the initialization; it is stored in
        * the initializer member before initialize() is called
1395    */
1396   void initialize(Initializer init) {
1397     initializer = init;
1398     initialize();
1399   }
1400
1401   /**
1402    * @brief     set the memory format
1403    * @param     fm format of Tensor
        * @note      the format stored in dim is always set to @a fm
1404    */
1405   void convertFormat(TensorDim::Format fm) {
1406     if (getFormat() != fm) {
           // NOTE(review): this resolves to the const transpose(direction)
           // overload, which returns a new Tensor by value; the result is
           // discarded here, so this tensor's own data does not appear to be
           // rearranged -- confirm whether that is intended.
1407       transpose("2:1:0");
1408     }
1409
1410     dim.setFormat(fm);
1411   }
1412
1413   /**
1414    * @brief     Copy the Tensor
1415    * @param[in] from Tensor to be copied
1416    *
1417    * @note copy can reshape this tensor to match the shape of @a from
1418    */
1419   void copy(const Tensor &from);
1420
1421   /**
1422    * @brief     Copy the data of the Tensor
1423    * @param[in] from Tensor to be copied
        * @note      NOTE(review): how this differs from copy() is not visible in
        * this header -- see the implementation
1424    */
1425   void copyData(const Tensor &from);
1426
1427   /**
1428    * @brief     Copy the Tensor
1429    * @param[in] from Tensor to be copied
        * @note      NOTE(review): judging by the name this variant honours the
        * strides -- confirm in the implementation
1430    */
1431   void copy_with_stride(const Tensor &from);
1432
1433   /**
1434    * @brief Get slice of the tensor, sliced by batch
1435    * @param[in] offset offset in batch to start the slice
1436    * @param[in] size size of the slice (presumably counted in batches -- confirm)
1437    * @retval slice of this tensor
1438    * @note This function provides a slice of this tensor, and does not create a
1439    * copy
1440    */
1441   Tensor getBatchSlice(size_t offset, unsigned int size) const;
1442
1443   /**
1444    * @brief Get new tensor which shares memory with current tensor but different
1445    * shape
1446    *
1447    * @param dim new dimension to be set for this tensor
1448    * @param offset offset to be used from the start of the data in elements
        * @param reset_stride default true (NOTE(review): presumably recomputes
        * the strides from @a dim -- confirm in the implementation)
        * @param name_ name for the new tensor (default empty)
        * @return Tensor the new tensor, returned by value
1449    * @note The new tensor will share the same data as the current tensor but
1450    * can have different size.
1451    * @note New size added with offset must be less than the size of the original
1452    * tensor.
1453    */
1454   Tensor getSharedDataTensor(const TensorDim dim, size_t offset,
1455                              bool reset_stride = true,
1456                              const std::string &name_ = "") const;
1457   /**
1458    * @brief split tensor along axis.
1459    *
1460    * @param num_size num_size
1461    * @param axis axis (default 0)
1462    * @return std::vector<Tensor> split tensors
1463    */
1464   std::vector<Tensor> split(unsigned num_size, int axis = 0);
1465
1466   /**
1467    * @brief split tensor along axis.
1468    *
1469    * @param sizes sizes
1470    * @param axis axis (default 0)
1471    * @return std::vector<Tensor> split tensors
1472    * @note if the given array sizes is just a 1 unsigned int value, assumes that
1473    * it divide tensor by given size evenly
1474    */
1475   std::vector<Tensor> split(std::vector<size_t> sizes, int axis = 0);
1476
1477   /**
1478    * @brief concatenate tensors along axis
1479    *
1480    * @param tensors tensors to be concatenated to the first tensor
1481    * @param axis axis (default 0)
1482    * @return Tensor concatenated tensor, returned by value
        * @note static member: it is called without a Tensor instance
1483    */
1484   static Tensor cat(const std::vector<Tensor> &tensors, int axis = 0);
1485
1486   /**
1487    * @brief make this tensor share memory with given tensor
1488    *
1489    * @param src Source tensor whose memory is to be shared
1490    * @param offset offset to be used from the start of the data in bytes
        * (default 0; NOTE(review): getSharedDataTensor() documents its offset in
        * elements -- confirm which unit applies here)
1491    * @note This tensor will share the same data as the source tensor but
1492    * can have different size.
1493    * @note This tensor's size added with offset must be less than the size of
1494    * the source tensor.
1495    * @note The stride of the source tensor and this tensor must be same.
1496    */
1497   void makeSharedDataTensor(const Tensor &src, size_t offset = 0);
1498
1499   /**
1500    * @brief     Convenient wrapper for inplace copy of @a this.
1501    * @retval    Copied version of this, returned by value
1502    */
1503   Tensor clone() const;
1504
1505   /**
1506    * @brief     Save the Tensor into file
1507    * @param[in,out] file output file stream the tensor is written to
1508    */
1509   void save(std::ostream &file);
1510
1511   /**
1512    * @brief     Read the Tensor from file
1513    * @param[in,out] file input file stream the tensor is read from
1514    */
1515   void read(std::ifstream &file);
1516
1517   /**
1518    * @brief     return argument index which value is max by batch
1519    * @retval    std::vector<unsigned int> argument indices
1520    */
1521   std::vector<unsigned int> argmax() const;
1522
1523   /**
1524    * @brief     return max of the absolute values of the tensor
1525    * @retval    maximum absolute value as a float
1526    */
1527   float max_abs() const;
1528
1529   /**
1530    * @brief     return a copy of the Tensor Dim
1531    * @retval    TensorDim copy constructed from the dim member
1532    */
1533   TensorDim getDim() const { return TensorDim(dim); }
1534
1535   /**
1536    * @brief     return Tensor Dim for a given axis
        * @param[in] axis axis to query
1537    * @retval    dimension
1538    */
1539   size_t getTensorDim(unsigned int axis);
1540
1541   /**
1542    * @brief     return Tensor Type
        * @retval    TensorDim::TensorType as reported by dim.getTensorType()
1543    */
1544   TensorDim::TensorType getTensorType() const { return dim.getTensorType(); };
1545
1546   /**
1547    * @brief     return Tensor batch size
1548    * @retval    batch size, as reported by dim.batch()
1549    */
1550   size_t batch() const { return dim.batch(); }
1551
1552   /**
1553    * @brief     return Tensor channel size
1554    * @retval    channel size, as reported by dim.channel()
1555    */
1556   size_t channel() const { return dim.channel(); }
1557
1558   /**
1559    * @brief     return Tensor height size
1560    * @retval    height size, as reported by dim.height()
1561    */
1562   size_t height() const { return dim.height(); }
1563
1564   /**
1565    * @brief     return Tensor width size
1566    * @retval    width size, as reported by dim.width()
1567    */
1568   size_t width() const { return dim.width(); }
1569
1570   /**
1571    * @brief     return Tensor Data Type Size
1572    * @retval    data type size, as reported by dim.getDataTypeSize()
1573    */
1574   uint getDataTypeSize() const { return dim.getDataTypeSize(); }
1575
1576   /**
1577    * @brief     update batch size for this tensor
1578    * @param     batch new batch size
        * @throws    std::invalid_argument if the batch size differs from the
        * current one while the tensor is allocated
        *
        * @note      Requesting the current batch size is a no-op, even for an
        * allocated tensor.
        *
        * @note      Only the dimension is updated here; no memory is allocated or
        * released by this function.
        *
        * @note      NOTE(review): earlier documentation described re-allocation
        * on batch change and advised deallocating all tensors, calling
        * updateBatch and then allocating again. The code requires the tensor to
        * be unallocated, so that ordering is mandatory for this tensor.
1594    */
1596   void updateBatch(unsigned int batch) {
         const bool unchanged = (dim.batch() == batch);
         if (!unchanged) {
           if (isAllocated()) {
             throw std::invalid_argument(
               "Cannot update batch for an allocated tensor");
           }
           dim.batch(batch);
         }
1605   }
1606
1607   /**
1608    * @brief     return Data pointer of Tensor
1609    * @retval    template T pointer (float pointer as default): the address from
        * the memory data advanced by the tensor offset, or nullptr when no memory
        * data is attached
        * @note      validate() is called on the memory data before the address is
        * taken
1610    */
1611   template <typename T = float> T *getData() {
         if (data) {
           data->validate();
           return (T *)((data->getAddr<T>()) + offset);
         }

         return nullptr;
1617   }
1618
1619   /**
1620    * @brief     return read-only Data pointer of Tensor
1621    * @retval    template T pointer (float pointer as default): the address from
        * the memory data advanced by the tensor offset, or nullptr when no memory
        * data is attached
        * @note      validate() is called on the memory data before the address is
        * taken
1622    */
1623   template <typename T = float> const T *getData() const {
         if (data) {
           data->validate();
           return (T *)(data->getAddr<T>() + offset);
         }

         return nullptr;
1629   }
1630
1631   /**
1632    * @brief     return Data pointer of Tensor at the given element index
        * @param[in] idx index of the element, counted in elements of type T
1633    * @retval    template T pointer (float pointer as default), nullptr when no
        * memory data is attached
        * @note      validate() is called on the memory data before the address is
        * taken
1634    */
1635   template <typename T = float> T *getData(size_t idx) const {
1636     if (!data)
1637       return nullptr;
1638
         // idx is added with the same pointer arithmetic the other getData()
         // overloads use for offset, so it must not be pre-multiplied by
         // sizeof(T): that would scale the index twice. (Assumes getAddr<T>()
         // yields a T-typed pointer, as its use in every overload suggests.)
1641     data->validate();
1642     return (T *)(data->getAddr<T>() + offset + idx);
1643   }
1644
1645   /**
        * @brief     set the data type stored in the tensor dimension
        * @param[in] d_type data type, forwarded to dim.setDataType()
        */
       void setDataType(Tdatatype d_type) { dim.setDataType(d_type); }
1646
1647   /**
        * @brief     set the tensor type stored in the tensor dimension
        * @param[in] t_type tensor type, forwarded to dim.setTensorType()
        */
       void setTensorType(ml::train::TensorDim::TensorType t_type) {
1648     dim.setTensorType(t_type);
1649   }
1650
1651   /**
1652    * @brief     put data of Tensor: calls invalidate() on the memory data
1653    *
1654    * @note      It is only effective when memory_swap is used
        * @note      Does nothing when no memory data is attached
1655    */
1656   void putData() const {
         if (data) {
           data->invalidate();
         }
1661   }
1662
1663   /**
1664    * @brief     return the memory data object held by this Tensor
1665    * @retval    shared pointer to the MemoryData (copy of the data member)
1666    */
1667   const std::shared_ptr<MemoryData> getMemoryData() const { return data; }
1668
1669   /**
1670    * @brief     return offset
        * @retval    value of the offset member, which getData() adds to the base
        * address
1671    */
1672   unsigned int getOffset() const { return offset; }
1673
1674   /**
1679    * @brief     set Tensor Dim
1680    * @param[in] d TensorDim
1681    * @note      Throws std::invalid_argument if size mismatch
1682    */
1683   void reshape(const TensorDim &d);
1684
1685   /**
1686    * @brief fill tensor data with current value,
1687    * if dimension is not exactly same, it is a hard error in this function
1688    * so, only stride is overridden to @a this
1689    *
1690    * @param from Tensor to fill the data from
1691    * @param allocate if unallocated, allocate with from.getDim() (default false)
1692    * @throws std::invalid_argument if dimension and stride does not match
1693    */
1694   void fill(const Tensor &from, bool allocate = false);
1695
1696   /**
1697    * @brief     return current stride of tensor.
1698    * @retval    std::array<size_t, TensorDim::MAXDIM> copy of the strides
1699    */
1700   const std::array<size_t, TensorDim::MAXDIM> getStrides() const noexcept {
1701     return strides;
1702   }
1703   /**
1704    * @brief Get linear index given the n-d index
1705    */
1706   inline size_t getIndex(unsigned int b, unsigned int c, unsigned int h,
1707                          unsigned int w) const noexcept {
1708     if (getFormat() == Tformat::NCHW) {
1709       return (b * strides[0] + c * strides[1] + h * strides[2] +
1710               w * strides[3]);
1711     } else {
1712       return (b * strides[0] + h * strides[1] + w * strides[2] +
1713               c * strides[3]);
1714     }
1715   }
1716
1717   /**
1718    * @brief Check if two given axes are contiguous
1719    */
1720   bool checkContinuous(unsigned int n, unsigned int np1) const {
1721     std::vector<unsigned int> continuous_order_nhwc = {0, 3, 1, 2};
1722     bool continuous = false;
1723     if (getFormat() == Tformat::NHWC) {
1724       if (continuous_order_nhwc[np1] == continuous_order_nhwc[n] + 1)
1725         continuous = true;
1726     } else {
1727       if (n + 1 == np1)
1728         continuous = true;
1729     }
1730     return continuous;
1731   }
1732
1733   /**
1734    * @brief   Set name of the tensor
1735    *
1736    * @param name_ new name of the tensor
1737    */
1738   void setName(const std::string &name_) { name = name_; }
1739
1740   /**
1741    * @brief   Get name of the tensor
1742    *
1743    * @return name of the tensor, as a reference to the name member
1744    */
1745   const std::string &getName() const { return name; }
1746
1747   /**
1748    * @brief Set the memory buffer for the tensor
1749    *
1750    * @param buf the memory buffer
1751    * @param init intialize the buffer
1752    */
1753   void setData(const std::shared_ptr<MemoryData> buf, unsigned int off = 0,
1754                bool init = false) {
1755     if (buf) {
1756       data = buf;
1757       offset = off;
1758       if (init)
1759         initialize();
1760     } else {
1761       data = nullptr;
1762       offset = 0;
1763     }
1764   }
1765
1766   /**
1767    * @brief Get initializer for the tensor
1768    *
1769    * @return initializer of the tensor (value of the initializer member)
1770    */
1771   Tensor::Initializer getInitializer() const { return initializer; }
1772
1773   /**
1774    * @brief Get format for the tensor
1775    *
1776    * @return format of the tensor, as reported by dim.getFormat()
1777    */
1778   TensorDim::Format getFormat() const { return dim.getFormat(); }
1779
1780   /**
1781    * @brief Get data type for the tensor
1782    *
1783    * @return data type of the tensor, as reported by dim.getDataType()
1784    */
1785   Tdatatype getDataType() const { return dim.getDataType(); }
1786
1787   /** small constant 1e-5 (NOTE(review): its users are not visible here) */
       static constexpr float epsilon = 1e-5;
1788
1789 private:
1790   /** core state of the tensor */
1791   TensorDim dim; /**< dimension, format and data type of the tensor */
1792   std::array<size_t, TensorDim::MAXDIM> strides; /**< stride of each axis */
1793   bool contiguous; /**< contiguity flag checked by apply() and setDist() */
1794   Tensor::Initializer initializer; /**< stored by initialize(Initializer) */
1795   std::string name; /**< name of the tensor */
1796   std::shared_ptr<MemoryData> data; /**< memory data object of the tensor */
1797   unsigned int offset; /**< added to the base address in getData() */
1798
1799   /**
1800    * When using shared_data with tensor, this stores the ptr of the source
1801    * tensor which handles the full memory. If tensor data is already allocated,
1802    * this does not affect the tensor. If the tensor data is not allocated, and
1803    * src_ptr is valid, this tensor will use the memory allocated by the src_ptr
1804    */
1805   std::shared_ptr<SrcSharedTensor> src_tensor;
1806
1807   struct BroadcastInfo; /**< forward declaration; defined elsewhere */
1808
1809   /**
1810    * @brief Applies the given operator to the tensor with the passed argument
1811    * @param[in] m Tensor
1812    * @param[in] v_func vectorized function to apply
        * @param[out] output output Tensor
1813    * @param e broadcast info.
1814    * @param cur_axis current axis. pass default when calling outside.
1815    * @param offset offset for this.  pass default when calling outside.
1816    * @param m_offset offset for m.  pass default when calling outside.
1817    * @note returns nothing (NOTE(review): presumably failures are reported by
1818    * throwing -- confirm in the implementation)
1819    */
1820   void
1821   apply_broadcast_util(Tensor const &m,
1822                        std::function<void(const BroadcastInfo &e, const float *,
1823                                           const float *, float *)>
1824                          v_func,
1825                        Tensor &output, const BroadcastInfo &e,
1826                        int cur_axis = -1, size_t offset = 0,
1827                        size_t m_offset = 0) const;
1828
1829   void apply_broadcast_util(
1830     Tensor const &m,
1831     std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1832                        __fp16 *)>
1833       v_func,
1834     Tensor &output, const BroadcastInfo &e, int cur_axis = -1,
1835     size_t offset = 0, size_t m_offset = 0) const;
1836
1837   /**
1838    * @brief Applies the given operator to the tensor with the passed argument
1839    *
1840    * @param[in] m Tensor
1841    * @param[in] v_func vectorized function to apply (float element pointers)
1842    * @param output non-const tensor reference; presumably receives the result
1843    * @note returns void, so no ML_ERROR_* status is reported to the caller
1844    */
1845   void apply_broadcast(Tensor const &m,
1846                        std::function<void(const BroadcastInfo &e, const float *,
1847                                           const float *, float *)>
1848                          v_func,
1849                        Tensor &output) const;
1850   /** __fp16 overload; differs only in v_func's pointer types */
1851   void
1852   apply_broadcast(Tensor const &m,
1853                   std::function<void(const BroadcastInfo &e, const __fp16 *,
1854                                      const __fp16 *, __fp16 *)>
1855                     v_func,
1856                   Tensor &output) const;
1857
1858   /**
1859    * @brief Compute the loop information for broadcasting and vectorization
1860    *
1861    * @param m target tensor to be calculated against
1862    * @return BroadcastInfo loop info needed to run the external loop
1863    */
1864   BroadcastInfo computeBroadcastInfo(const Tensor &m) const;
1865
1866   /**
1867    * @brief Copy a buffer into @a this. The caller has to ensure that @a this
1868    * is initialized; otherwise the behavior is undefined.
1869    *
1870    * @param buf buffer to copy from
1871    */
1872   void copy(const void *buf);
1873
1874   /**
1875    * @brief Update destination tensor to share memory with source tensor
1876    *
1877    * @param src source tensor containing the memory
1878    * @param dest destination tensor which will share the memory
1879    * @param offset offset to be used from the start of the data in bytes
1880    * @note dest will share the same data as src but can have a different
1881    * size.
1882    * @note The new (dest) size added with offset must be less than the size of
1883    * the original (src) tensor.
1884    */
1885   static void createSharedDataTensor(const Tensor &src, Tensor &dest,
1886                                      size_t offset);
1887
1888   /**
1889    * @brief    Release this tensor's memory and allocate it again
1890    * @note     This will not necessarily free the memory, as tensors share memory
1891    * @note     Calling this on every tensor of a model one after another can
1892    * raise the peak memory consumption. Prefer deallocating all the tensors
1893    * first and allocating them afterwards over reallocating one by one.
1894    */
1895   void reallocate() {
1896     this->deallocate();
1897     this->allocate();
1898   }
1899
1900   /**
1901    * @brief Merge the two given axes of the tensor in place, at the second axis
1902    *
1903    * @param axis1 first axis to merge
1904    * @param axis2 second axis to merge
1905    */
1906   void mergeAxis(unsigned int axis1, unsigned int axis2);
1907
1908   /**
1909    * @brief     rotate 180 degrees
1910    * @param[in] in input Tensor (taken by value)
1911    * @return    Tensor rotated tensor (180 degrees)
1912    */
1913   Tensor rotate_180(Tensor in);
1914
1915 }; // class Tensor
1916
1917 /**
1918  * @brief   Output stream operator (operator<<) overload for Tensor
1919  */
1920 std::ostream &operator<<(std::ostream &out, Tensor const &m);
1921
1922 using sharedTensor = std::shared_ptr<Tensor>; /**< shared pointer to a mutable Tensor */
1923
1924 using sharedConstTensor = std::shared_ptr<const Tensor>; /**< shared pointer to a const Tensor */
1925
1926 using sharedConstTensors = std::vector<sharedConstTensor>; /**< vector of sharedConstTensor */
1927
1928 using sharedTensors = std::vector<sharedTensor>; /**< vector of sharedTensor */
1929
1930 } /* namespace nntrainer */
1931
1932 #endif /* __cplusplus */
1933 #endif /* __TENSOR_H__ */