nntrainer/tensor/tensor.h

   1 /**
   2  * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *   http://www.apache.org/licenses/LICENSE-2.0
   8  * Unless required by applicable law or agreed to in writing, software
   9  * distributed under the License is distributed on an "AS IS" BASIS,
  10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11  * See the License for the specific language governing permissions and
  12  * limitations under the License.
  13  *
  14  *
  15  * @file        tensor.h
  16  * @date        04 December 2019
  17  * @brief       This is Tensor class for calculation
  18  * @see         https://github.com/nnstreamer/nntrainer
  19  * @author      Jijoong Moon <jijoong.moon@samsung.com>
  20  * @bug         No known bugs except for NYI items
  21  *
  22  * @todo deprecate new tensor allocation for out of place operations.
  23  */
  24
  25 #ifndef __TENSOR_H__
  26 #define __TENSOR_H__
  27 #ifdef __cplusplus
  28
  29 #include <array>
  30 #include <functional>
  31 #include <memory>
  32 #include <random>
  33 #include <stdexcept>
  34 #include <vector>
  35
  36 #include <blas_interface.h>
  37 #include <iostream>
  38 #include <memory_data.h>
  39 #include <nntrainer_error.h>
  40 #include <nntrainer_log.h>
  41 #include <tensor_dim.h>
  42 #include <util_func.h>
  43
  44 #ifdef DEBUG
  45 #define EXCEPT_WHEN_DEBUG
  46 #else
  47 #define EXCEPT_WHEN_DEBUG noexcept
  48 #endif
  49
  50 #define MAKE_SHARED_TENSOR(...) std::make_shared<nntrainer::Tensor>(__VA_ARGS__)
  51
  52 #define CREATE_IF_EMPTY_DIMS(tensor, ...) \
  53   do {                                    \
  54     if (tensor.empty())                   \
  55       tensor = Tensor(__VA_ARGS__);       \
  56   } while (0);
  57
  58 namespace nntrainer {
  59
  60 using TensorDim = ml::train::TensorDim;
  61 using Tformat = ml::train::TensorDim::Format;
  62 using Tdatatype = ml::train::TensorDim::DataType;
  63
  64 class LazyTensor;
  65 class SrcSharedTensor;
  66
  67 /**
  68  * @class   Tensor Class for Calculation
  69  * @brief   Tensor Class for Calculation
  70  */
  71 class Tensor {
  72 public:
  /**
   * @brief     Enumeration of Weight Initialization Type
   * @todo      support initialization from file
   */
  enum class Initializer {
    /** Zero initialization */
    ZEROS,
    /** One initialization */
    ONES,
    /** LeCun normal initialization */
    LECUN_NORMAL,
    /** LeCun uniform initialization */
    LECUN_UNIFORM,
    /** Xavier normal initialization */
    XAVIER_NORMAL,
    /** Xavier uniform initialization */
    XAVIER_UNIFORM,
    /** He normal initialization */
    HE_NORMAL,
    /** He uniform initialization */
    HE_UNIFORM,
    /** No initialization */
    NONE
  };
  88
  89   /**
  90    * @brief     Basic Constructor of Tensor
  91    */
  92   Tensor(std::string name_ = "", Tformat fm = Tformat::NCHW,
  93          Tdatatype d_type = Tdatatype::FP32) :
  94     dim(TensorDim(fm, d_type)),
  95     strides(dim.computeStrides()),
  96     contiguous(true),
  97     initializer(Initializer::NONE),
  98     name(name_),
  99     data(nullptr),
 100     offset(0),
 101     src_tensor() {}
 102
 103   /**
 104    * @brief     Constructor of Tensor with dimension, possibly lazily
 105    * @param d Tensor dim for this tensor
 106    * @param alloc_now If the memory of the tensor must be allocated
 107    * @param init Initializer for the tensor
 108    * @param name Name of the tensor
 109    */
 110   Tensor(const TensorDim &d, bool alloc_now,
 111          Initializer init = Initializer::NONE, std::string name = "");
 112
 113   /**
 114    * @brief     Constructor of Tensor with dimension/buf
 115    * @param d Tensor dim for this tensor
 116    * @param buf buffer
 117    * @note Memory for this tensor is instantaneously allocated
 118    */
 119   Tensor(const TensorDim &d, const void *buf = nullptr);
 120
 121   /**
 122    * @brief     Constructor of Tensor
 123    * @param[in] d0 Batch of Tensor
 124    * @param[in] d1 Channel
 125    * @param[in] d2 Height
 126    * @param[in] d3 Width
 127    */
 128   Tensor(size_t d0, size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 129          Tdatatype d_type = Tdatatype::FP32) :
 130     Tensor(TensorDim(d0, d1, d2, d3, fm, d_type), nullptr){};
 131
 132   /**
 133    * @brief     Constructor of Tensor
 134    * @param[in] d1 Channel
 135    * @param[in] d2 Height
 136    * @param[in] d3 Width
 137    */
 138   Tensor(size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 139          Tdatatype d_type = Tdatatype::FP32) :
 140     Tensor(1, d1, d2, d3, fm, d_type){};
 141
 142   /**
 143    * @brief     Constructor of Tensor with batch size one and d1 size one
 144    * @param[in] d2 Height (NCHW) or Width (NHWC)
 145    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 146    */
 147   Tensor(size_t d2, size_t d3, Tformat fm = Tformat::NCHW,
 148          Tdatatype d_type = Tdatatype::FP32) :
 149     Tensor(1, 1, d2, d3, fm, d_type){};
 150
 151   /**
 152    * @brief     Constructor of Tensor with just Width or Channel
 153    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 154    */
 155   explicit Tensor(size_t d3, Tformat fm = Tformat::NCHW,
 156                   Tdatatype d_type = Tdatatype::FP32) :
 157     Tensor(1, 1, 1, d3, fm, d_type){};
 158
 159   /**
 160    * @brief     Constructor of Tensor
 161    * @param[in] d0 Batch of Tensor
 162    * @param[in] d1 Channel (NCHW) or Height (NHWC)
 163    * @param[in] d2 Height (NCHW) or Width (NHWC)
 164    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 165    */
 166   Tensor(size_t d0, size_t d1, size_t d2, size_t d3,
 167          ml::train::TensorDim::TensorType t_type) :
 168     Tensor(TensorDim(d0, d1, d2, d3, t_type), nullptr){};
 169
 170   /**
 171    * @brief     Constructor of Tensor
 172    * @param[in] d1 Channel
 173    * @param[in] d2 Height
 174    * @param[in] d3 Width
 175    */
 176   Tensor(size_t d1, size_t d2, size_t d3,
 177          ml::train::TensorDim::TensorType t_type) :
 178     Tensor(1, d1, d2, d3, t_type){};
 179
 180   /**
 181    * @brief     Constructor of Tensor with batch size one and d1 size one
 182    * @param[in] d2 Height (NCHW) or Width (NHWC)
 183    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 184    */
 185   Tensor(size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) :
 186     Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3,
 187            (t_type.format == Tformat::NCHW) ? d2 : 1,
 188            (t_type.format == Tformat::NCHW) ? d3 : d2, t_type){};
 189   /**
 190    * @brief     Constructor of Tensor with just Width or Channel
 191    * @param[in] d3 Width (NCHW) or Channel (NHWC)
 192    */
 193   explicit Tensor(size_t d3, ml::train::TensorDim::TensorType t_type) :
 194     Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3, 1,
 195            (t_type.format == Tformat::NCHW) ? d3 : 1, t_type){};
 196
 197   /**
 198    * @brief     Constructor of Tensor
 199    * @param[in] d data for the Tensor. It needs to set format properly.
 200    */
 201
 202   Tensor(std::vector<std::vector<std::vector<std::vector<float>>>> const &d,
 203          ml::train::TensorDim::TensorType t_type) {
 204     if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
 205       throw std::out_of_range(
 206         "[Tensor] trying to initialize Tensor from empty vector");
 207     }
 208     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 209     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 210     // dim[1] == height, dim[2] == width, dim[3] == channel
 211     dim.setTensorDim(0, d.size());
 212     if (t_type.format == Tformat::NCHW) {
 213       dim.setTensorDim(1, d[0].size());
 214       dim.setTensorDim(2, d[0][0].size());
 215       dim.setTensorDim(3, d[0][0][0].size());
 216     } else {
 217       dim.setTensorDim(2, d[0].size());
 218       dim.setTensorDim(3, d[0][0].size());
 219       dim.setTensorDim(1, d[0][0][0].size());
 220     }
 221
 222     setTensorType(t_type);
 223
 224     strides = dim.computeStrides();
 225
 226     MemoryData *mem_data =
 227       new MemoryData((void *)(new float[dim.getDataLen()]()));
 228     data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
 229       delete[] mem_data->getAddr<float>();
 230     });
 231     offset = 0;
 232     contiguous = true;
 233     initializer = Initializer::NONE;
 234
 235     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 236     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 237     // dim[1] == height, dim[2] == width, dim[3] == channel
 238     if (t_type.format == Tformat::NCHW) {
 239       for (unsigned int i = 0; i < batch(); ++i)
 240         for (unsigned int j = 0; j < channel(); ++j)
 241           for (unsigned int k = 0; k < height(); ++k)
 242             for (unsigned int l = 0; l < width(); ++l)
 243               this->setValue(i, j, k, l, d[i][j][k][l]);
 244     } else {
 245       for (unsigned int i = 0; i < batch(); ++i)
 246         for (unsigned int j = 0; j < height(); ++j)
 247           for (unsigned int k = 0; k < width(); ++k)
 248             for (unsigned int l = 0; l < channel(); ++l)
 249               this->setValue(i, l, j, k, d[i][j][k][l]);
 250     }
 251   };
 252
 253   /**
 254    * @brief     Constructor of Tensor
 255    * @note      This constructor copies vector again. needs refactoring
 256    * @param[in] d data for the Tensor. It needs to set format properly.
 257    */
 258   Tensor(std::vector<std::vector<std::vector<float>>> const &d,
 259          ml::train::TensorDim::TensorType t_type) :
 260     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 261
 262   /**
 263    * @brief     Constructor of Tensor
 264    * @note      This constructor copies vector again. needs refactoring
 265    * @param[in] d data for the Tensor with batch size one
 266    */
 267   Tensor(std::vector<std::vector<float>> const &d,
 268          ml::train::TensorDim::TensorType t_type) :
 269     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 270
 271 #ifdef ENABLE_FP16
 272   Tensor(std::vector<std::vector<std::vector<std::vector<__fp16>>>> const &d,
 273          ml::train::TensorDim::TensorType t_type) {
 274
 275     if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
 276       throw std::out_of_range(
 277         "[Tensor] trying to initialize Tensor from empty vector");
 278     }
 279
 280     dim.setTensorDim(0, d.size());
 281     if (t_type.format == Tformat::NCHW) {
 282       dim.setTensorDim(1, d[0].size());
 283       dim.setTensorDim(2, d[0][0].size());
 284       dim.setTensorDim(3, d[0][0][0].size());
 285     } else {
 286       dim.setTensorDim(2, d[0].size());
 287       dim.setTensorDim(3, d[0][0].size());
 288       dim.setTensorDim(1, d[0][0][0].size());
 289     }
 290
 291     setTensorType(t_type);
 292
 293     strides = dim.computeStrides();
 294
 295     MemoryData *mem_data =
 296       new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
 297     data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
 298       delete[] mem_data->getAddr<__fp16>();
 299     });
 300     offset = 0;
 301     contiguous = true;
 302     initializer = Initializer::NONE;
 303
 304     setDataType(Tdatatype::FP16);
 305
 306     // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
 307     // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
 308     // dim[1] == height, dim[2] == width, dim[3] == channel
 309     if (t_type.format == Tformat::NCHW) {
 310       for (unsigned int i = 0; i < batch(); ++i)
 311         for (unsigned int j = 0; j < channel(); ++j)
 312           for (unsigned int k = 0; k < height(); ++k)
 313             for (unsigned int l = 0; l < width(); ++l)
 314               this->setValue(i, j, k, l, d[i][j][k][l]);
 315     } else {
 316       for (unsigned int i = 0; i < batch(); ++i)
 317         for (unsigned int j = 0; j < height(); ++j)
 318           for (unsigned int k = 0; k < width(); ++k)
 319             for (unsigned int l = 0; l < channel(); ++l)
 320               this->setValue(i, l, j, k, d[i][j][k][l]);
 321     }
 322   };
 323
 324   /**
 325    * @brief     Constructor of Tensor
 326    * @note      This constructor copies vector again. needs refactoring
 327    * @param[in] d data for the Tensor
 328    */
 329   Tensor(std::vector<std::vector<std::vector<__fp16>>> const &d,
 330          ml::train::TensorDim::TensorType t_type) :
 331     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 332
 333   /**
 334    * @brief     Constructor of Tensor
 335    * @note      This constructor copies vector again. needs refactoring
 336    * @param[in] d data for the Tensor with batch size one
 337    */
 338   Tensor(std::vector<std::vector<__fp16>> const &d,
 339          ml::train::TensorDim::TensorType t_type) :
 340     Tensor(std::vector<std::decay<decltype(d)>::type>{d}, t_type){};
 341
 342 #endif
 343
 344   /**
 345    *  @brief  Copy constructor of Tensor.
 346    *  @param[in] Tensor &
 347    */
 348   Tensor(const Tensor &rhs) = default;
 349
 350   /**
 351    *  @brief  Move constructor of Tensor.
 352    *  @param[in] Tensor &&
 353    */
 354   Tensor(Tensor &&rhs) noexcept = default;
 355
 356   /**
 357    * @brief  Copy assignment operator.
 358    * @param[in] rhs Tensor to be copied.
 359    */
 360   Tensor &operator=(const Tensor &rhs) = default;
 361
 362   /**
 363    * @brief  Move assignment operator.
 364    * @param[in] rhs Tensor to be moved.
 365    */
 366   Tensor &operator=(Tensor &&rhs) noexcept = default;
 367
 368   /**
 369    * @brief Construct a new Tensor object from a buffer
 370    * This will not copy buffer to a new tensor but directly uses it
 371    *
 372    * @param buf buffer
 373    * @param bytes buffer size in bytes
 374    * @param d tensor dim
 375    * @param offset offset to be used from current
 376    * @return Tensor object
 377    * @throws std::invalid_argument if buf is null
 378    */
 379   template <typename T = float>
 380   static Tensor Map(T *buf, unsigned int bytes, const TensorDim &d,
 381                     size_t offset = 0) {
 382     if (d.getDataLen() == 0 || buf == nullptr) {
 383       throw std::invalid_argument(
 384         "[Tensor::Map] empty tensor dim is not allowed");
 385     }
 386
 387     if (d.getDataLen() * sizeof(T) + offset > bytes) {
 388       throw std::invalid_argument(
 389         "Creating shared tensor of size bigger than tensor memory.");
 390     }
 391
 392     Tensor tmp;
 393     tmp.dim = d;
 394     tmp.strides = d.computeStrides();
 395     /// Tensor does not own the memory
 396     tmp.data = std::shared_ptr<MemoryData>(new MemoryData((void *)buf),
 397                                            std::default_delete<MemoryData>());
 398     tmp.offset = offset;
 399
 400     return tmp;
 401   };
 402
 403   friend void swap(Tensor &lhs, Tensor &rhs) noexcept {
 404     std::swap(lhs.dim, rhs.dim);
 405     std::swap(lhs.strides, rhs.strides);
 406     std::swap(lhs.contiguous, rhs.contiguous);
 407     std::swap(lhs.initializer, rhs.initializer);
 408     std::swap(lhs.data, rhs.data);
 409     std::swap(lhs.name, rhs.name);
 410   }
 411
 412   /**
 413    * @brief     Comparison operator overload
 414    * @param[in] rhs Tensor to be compared with
 415    */
 416   bool operator==(const Tensor &rhs) const;
 417
 418   /**
 419    * @brief     Comparison operator overload
 420    * @param[in] rhs Tensor to be compared with
 421    */
 422   bool operator!=(const Tensor &rhs) const { return !(*this == rhs); }
 423
 424   /**
 425    * @brief    Allocate memory for this tensor
 426    */
 427   void allocate();
 428
 429   /**
 430    * @brief    Deallocate memory for this tensor
 431    * @note     This will not necessary free the memory as tensors share memory
 432    */
 433   void deallocate() {
 434     data = nullptr;
 435     offset = 0;
 436   }
 437
 438   /**
 439    * @brief    Check if the tensor has memory allocated/assigned/associated
 440    */
 441   bool isAllocated() const { return data != nullptr; }
 442
 443   /**
 444    * @brief     return value at specific location
 445    * @param[in] batch batch location
 446    * @param[in] c channel location
 447    * @param[in] h height location
 448    * @param[in] w width location
 449    */
 450   template <typename T = float>
 451   const T &getValue(unsigned int batch, unsigned int c, unsigned int h,
 452                     unsigned int w) const noexcept {
 453     return getValue<T>(getIndex(batch, c, h, w));
 454   }
 455
 456   template <typename T = float>
 457   T &getValue(unsigned int batch, unsigned int c, unsigned int h,
 458               unsigned int w) noexcept {
 459     return getValue<T>(getIndex(batch, c, h, w));
 460   }
 461
 462   /**
 463    * @brief     return value at specific location
 464    * @param[in] idx location
 465    */
 466   template <typename T = float>
 467   const T &getValue(unsigned int idx) const noexcept {
 468     return getData<T>()[idx];
 469   }
 470
 471   /**
 472    * @brief     return value at specific location
 473    * @param[in] idx location
 474    */
 475   template <typename T = float> T &getValue(unsigned int idx) noexcept {
 476     return getData<T>()[idx];
 477   }
 478
 479   /**
 480    * @brief Get the Value thinking that it is padded
 481    * for example, for the tensor (virtually padded) below,
 482    * getValue(0, 0, 2, 2, 1, 1, .0f) will return 5
 483    * padding available for height and width axis for now
 484    * 0 0 0 0 0
 485    * 0 1 2 3 0
 486    * 0 4 5 6 0
 487    * 0 7 8 9 0
 488    * 0 0 0 0 0
 489    * @param b batch index
 490    * @param c channel index
 491    * @param h height index
 492    * @param w width index
 493    * @param ph padding height
 494    * @param pw padding width
 495    * @return float value
 496    */
 497   template <typename T = float>
 498   const T getValuePaddedVirtual(unsigned int b, unsigned int c, unsigned int h,
 499                                 unsigned int w, unsigned int ph,
 500                                 unsigned int pw,
 501                                 T pad_value = 0) const EXCEPT_WHEN_DEBUG {
 502 #if DEBUG
 503     unsigned int padded_h = 2 * ph + h;
 504     unsigned int padded_w = 2 * pw + w;
 505     if (h > padded_h && w > padded_w) {
 506       throw std::out_of_range(
 507         "[Tensor::getValuePadded] trying to access out of range");
 508     }
 509 #endif
 510
 511     if (ph <= h && h < ph + height() && pw <= w && w < pw + width()) {
 512       return getValue<T>(b, c, h - ph, w - pw);
 513     }
 514
 515     return pad_value;
 516   }
 517
 518   /**
 519    * @brief     Multiply value element by element immediately
 520    * @param[in] value multiplier
 521    * @retval    #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right
 522    * @retval    #ML_ERROR_NONE Successful
 523    */
 524   int multiply_i(float const &value);
 525
 526   /**
 527    * @brief     Multiply value element by element
 528    * @param[in] value multiplier
 529    * @retval    Calculated Tensor
 530    */
 531   Tensor multiply(float const &value) const;
 532
 533   /**
 534    * @brief     multiply value element by element
 535    * @param[in] value multiplier
 536    * @param[out] out out tensor to store the result
 537    * @retval    Calculated Tensor
 538    */
 539   Tensor &multiply(float const &value, Tensor &out) const;
 540
 541   /**
 542    * @brief     Multiply Tensor Elementwise
 543    * @param[in] m Tensor to be multiplied
 544    * @param[in] beta scalar to multiply output with and add
 545    * @retval    #ML_ERROR_NONE successful
 546    */
 547   int multiply_i(Tensor const &m, const float beta = 0.0);
 548
 549   /**
 550    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 551    * @param[in] m Tensor to be multiplied
 552    * @param[in] beta scalar to multiply output with and add
 553    * @retval    Calculated Tensor
 554    */
 555   Tensor multiply(Tensor const &m, const float beta = 0.0) const;
 556
 557   /**
 558    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 559    * @param[in] m Tensor to be multiplied
 560    * @param[out] output Tensor to store the result
 561    * @param[in] beta scalar to multiply output with and add
 562    * @retval    Calculated Tensor
 563    */
 564   Tensor &multiply(Tensor const &m, Tensor &output,
 565                    const float beta = 0.0) const;
 566
 567   /**
 568    * @brief     Multiply Tensor Elementwise
 569    * @param[in] m Tensor to be multiplied
 570    * @param[in] beta scalar to multiply output with and add
 571    * @retval    #ML_ERROR_NONE successful
 572    *
 573    * @note support different strided inputs and output
 574    * @note does not support broadcasting
 575    *
 576    * @todo merge this to multiply_i
 577    */
 578   int multiply_i_strided(Tensor const &m, const float beta = 0.0);
 579
 580   /**
 581    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 582    * @param[in] m Tensor to be multiplied
 583    * @param[in] beta scalar to multiply output with and add
 584    * @retval    Calculated Tensor
 585    *
 586    * @note support different strided inputs and output
 587    * @note does not support broadcasting
 588    *
 589    * @todo merge this to multiply
 590    */
 591   Tensor multiply_strided(Tensor const &m, const float beta = 0.0) const;
 592
 593   /**
 594    * @brief     Multiply Tensor Element by Element ( Not the MxM )
 595    * @param[in] m Tensor to be multiplied
 596    * @param[out] output Tensor to store the result
 597    * @param[in] beta scalar to multiply output with and add
 598    * @retval    Calculated Tensor
 599    *
 600    * @note support different strided inputs and output
 601    * @note does not support broadcasting
 602    *
 603    * @todo merge this to multiply
 604    */
 605   Tensor &multiply_strided(Tensor const &m, Tensor &output,
 606                            const float beta = 0.0) const;
 607
 608   /**
 609    * @brief     Add Tensor Elementwise
 610    * @param[in] m Tensor to be added
 611    * @param[in] beta scalar to add output with and add
 612    * @retval    #ML_ERROR_NONE successful
 613    *
 614    * @note support different strided inputs and output
 615    * @note does not support broadcasting
 616    *
 617    * @todo merge this to add_i
 618    */
 619   int add_i_strided(Tensor const &m, const float beta = 0.0);
 620
 621   /**
 622    * @brief     Add Tensor Element by Element
 623    * @param[in] m Tensor to be added
 624    * @param[in] beta Value to scale the added tensor
 625    * @retval    Calculated Tensor
 626    *
 627    * @note support different strided inputs and output
 628    * @note does not support broadcasting
 629    *
 630    * @todo merge this to add
 631    */
 632   Tensor add_strided(Tensor const &m, const float beta = 0.0) const;
 633
 634   /**
 635    * @brief     Add Tensor Element by Element
 636    * @param[in] m Tensor to be added
 637    * @param[out] output Tensor to store the result
 638    * @param[in] beta Value to scale the added tensor
 639    * @retval    Calculated Tensor
 640    *
 641    * @note support different strided inputs and output
 642    * @note does not support broadcasting
 643    *
 644    * @todo merge this to add
 645    */
 646   Tensor &add_strided(Tensor const &m, Tensor &output,
 647                       const float beta = 0.0) const;
 648
 649   /**
 650    * @brief     Divide value element by element immediately
 651    * @param[in] value divisor
 652    * @retval    #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right
 653    * @retval    #ML_ERROR_NONE Successful
 654    */
 655   int divide_i(float const &value);
 656
 657   /**
 658    * @brief     Divide value element by element
 659    * @param[in] value Divisor
 660    * @retval    Calculated Tensor
 661    */
 662   Tensor divide(float const &value) const;
 663
 664   /**
 665    * @brief     Divide value element by element
 666    * @param[in] value Divisor
 667    * @param[out] out out parameter to store the result
 668    * @retval    Calculated Tensor
 669    */
 670   Tensor &divide(float const &value, Tensor &out) const;
 671
 672   /**
 673    * @brief     divide Tensor Elementwise
 674    * @param[in] m Tensor to divide by
 675    * @retval    #ML_ERROR_NONE successful
 676    */
 677   int divide_i(Tensor const &m);
 678
 679   /**
 680    * @brief     Divide Tensor Element by Element
 681    * @param[in] m Divisor Tensor
 682    * @retval    Calculated Tensor
 683    */
 684   Tensor divide(Tensor const &m) const;
 685
 686   /**
 687    * @brief     divide Tensor Elementwise
 688    * @param[in] m Tensor to divide by
 689    * @param[out] output Tensor to store the result
 690    * @retval    Calculated Tensor
 691    */
 692   Tensor &divide(Tensor const &m, Tensor &output) const;
 693
 694   /**
 695    * @brief Add Tensor Element immediately to target tensor without mem copy
 696    * @param[in] value value to be added
 697    * @retval #ML_ERROR_NONE  Successful
 698    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 699    */
 700   int add_i(float const &value);
 701
 702   /**
 703    * @brief     Add value Element by Element
 704    * @param[in] value value to be added
 705    * @retval    Calculated Tensor
 706    */
 707   Tensor add(float const &value) const;
 708
 709   /**
 710    * @brief     Add Tensor Element by Element
 711    * @param[in] value value to be added
 712    * @param[out] out Tensor to save output without allocating new memory
 713    * @retval    Calculated Tensor
 714    */
 715   Tensor &add(float const &value, Tensor &out) const;
 716
 717   /**
 718    * @brief Add Tensor Element by Element without mem copy
 719    * @param[in] m Tensor to be added
 720    * @param[in] alpha Values to be scaled
 721    * @retval #ML_ERROR_NONE  Successful
 722    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 723    */
 724   int add_i(Tensor const &m, float const alpha = 1);
 725
 726   /**
 727    * @brief     Add Tensor Element by Element
 728    * @param[in] m Tensor to be added
 729    * @retval    Calculated Tensor
 730    */
 731   Tensor add(Tensor const &m, float const alpha = 1) const;
 732
 733   /**
 734    * @brief     Add Tensor Element by Element
 735    * @param[in] m Tensor to be added
 736    * @param[out] out Tensor to store the result
 737    * @retval    Calculated Tensor
 738    */
 739   Tensor &add(Tensor const &m, Tensor &out, float const alpha = 1) const;
 740
 741   /**
 742    * @brief     memcpyless version of subtract
 743    * @param[in] value value to subtract
 744    * @retval #ML_ERROR_NONE  Successful
 745    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 746    */
 747   int subtract_i(float const &value);
 748
 749   /**
 750    * @brief     subtract value Element by Element
 751    * @param[in] value value to be subtracted
 752    * @retval    Calculated Tensor
 753    */
 754   Tensor subtract(float const &value) const;
 755
 756   /**
 757    * @brief     Subtract Tensor Element by Element
 758    * @param[in] value value to be subtracted
 759    * @param[out] out Tensor to save output without allocating new memory
 760    * @retval    Calculated Tensor
 761    */
 762   Tensor &subtract(float const &value, Tensor &out) const;
 763
 764   /**
 765    * @brief     memcpyless version of subtract
 766    * @param[in] m Tensor to be subtracted
 767    * @retval #ML_ERROR_NONE  Successful
 768    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
 769    */
 770   int subtract_i(Tensor const &m);
 771
 772   /**
 773    * @brief     Subtract Tensor Element by Element
 774    * @param[in] m Tensor to be subtracted
 775    * @retval    Calculated Tensor
 776    */
 777   Tensor subtract(Tensor const &m) const;
 778
 779   /**
 780    * @brief     Subtract Tensor Element by Element
 781    * @param[in] m Tensor to be subtracted
 782    * @param[out] out Tensor to store the result
 783    * @retval    Calculated Tensor
 784    */
 785   Tensor &subtract(Tensor const &m, Tensor &out) const;
 786
 787   /**
 788    * @brief Tensor power elementwise
 789    *
 790    * @param exponent exponent
 791    * @return int ML_ERROR_NONE if successful
 792    */
 793   int pow_i(float exponent);
 794
 795   /**
 796    * @brief    Tensor power Element by Element
 797    * @param[in] exponent exponent
 798    * @retval Calculated Tensor
 799    */
 800   Tensor pow(float exponent) const;
 801
 802   /**
 803    * @brief    Tensor power Element by Element
 804    * @param[in] exponent exponent
 805    * @param[out] out out to store the result
 806    * @retval Calculated Tensor
 807    */
 808   Tensor &pow(float exponent, Tensor &out) const;
 809
 810   /**
 811    * @brief  gaussian error function
 812    * @return int ML_ERROR_NONE if successful
 813    */
 814   int erf_i();
 815
 816   /**
 817    * @brief    gaussian error function
 818    * @retval Calculated Tensor
 819    */
 820   Tensor erf() const;
 821
 822   /**
 823    * @brief    gaussian error function
 824    * @param[out] out out to store the result
 825    * @retval Calculated Tensor
 826    */
 827   Tensor &erf(Tensor &out) const;
 828
 829   unsigned int sizeofData() { return dim.getDataTypeSize(); }
 830
 831   /**
 832    * @brief     Dot Product of Tensor ( equal MxM )
 833    * @details   This applies dot of the last dimension of this and second-last
 834    * dimension of passed tensor m.
 835    * @param[in] m Tensor
 836    * @param[in] trans Transpose
 837    * @param[in] trans_m Transpose m
 838    * @retval    Calculated Tensor
 839    */
 840   Tensor dot(Tensor const &m, bool trans = false, bool trans_m = false) const;
 841
 842   /**
 843    * @brief     Dot Product of Tensor ( equal MxM )
 844    * @details   This applies dot of the last dimension of this and second-last
 845    * dimension of passed tensor m.
 846    * @param[in] m Tensor
 847    * @param[in] output output Tensor
 848    * @param[in] trans Transpose
 849    * @param[in] trans_m Transpose m
 850    * @param[in] beta beta
 851    * @retval    Calculated Tensor
 852    */
 853   Tensor &dot(Tensor const &m, Tensor &output, bool trans = false,
 854               bool trans_m = false, float beta = 0.0f) const;
 855
  /**
   * @brief compute the derivative of this in the current tensor
   * @param[in] m same as given to the dot()
   * @param[in] output_deriv the derivative of the output
   * @param[in] trans same as given to the dot()
   * @param[in] trans_m same as given to the dot()
   * @param[in] beta same as given to the dot()
   * @return Tensor& NOTE(review): presumably a reference to this tensor, since
   * the derivative is computed in-place -- confirm against the implementation.
   * @note This will compute the derivative in-place and will overwrite existing
   * data in the tensor
   */
  Tensor &dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
                          bool trans = false, bool trans_m = false,
                          float beta = 0.0f);

  /**
   * @brief compute the derivative wrt m in the m tensor
   * @param[out] m_deriv tensor where derivative wrt m will be stored
   * @param[in] output_deriv the derivative of the output
   * @param[in] trans same as given to the dot()
   * @param[in] trans_m same as given to the dot()
   * @param[in] beta same as given to the dot()
   * @return Tensor& NOTE(review): presumably a reference to @a m_deriv --
   * confirm against the implementation.
   * @note The caller tensor must be the same tensor as the one which called the
   * dot() product.
   */
  Tensor &dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
                          bool trans = false, bool trans_m = false,
                          float beta = 0.0f) const;
 883
  /**
   * @copydoc Tensor::dot(Tensor const &m, Tensor &output, bool trans,
              bool trans_m, float beta) const
   * @details performs dot operation over a batch of inputs
   */
  Tensor &dotBatched(Tensor const &m, Tensor &result, bool trans = false,
                     bool trans_m = false, float beta = 0.0f) const;

  /**
   * @copydoc Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const
   &output_deriv, bool trans, bool trans_m, float beta)
   * @details batched counterpart of dot_deriv_wrt_1(), to be paired with
   * dotBatched()
   */
  Tensor &dot_batched_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
                                  bool trans = false, bool trans_m = false,
                                  float beta = 0.0f);

  /**
   * @copydoc Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const
   &output_deriv, bool trans, bool trans_m, float beta) const
   * @details batched counterpart of dot_deriv_wrt_2(), to be paired with
   * dotBatched()
   */
  Tensor &dot_batched_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
                                  bool trans = false, bool trans_m = false,
                                  float beta = 0.0f) const;
 907
  /**
   * @brief Transpose Tensor
   *
   * @param[in] direction to transpose ex) 0:2:1
   * @return Tensor transposed tensor, returned by value
   */
  Tensor transpose(const std::string &direction) const;

  /**
   * @brief Transpose Tensor
   * @param[in] direction to transpose ex) 0:2:1
   * @param[out] out Tensor to save to, dimension is always reshaped.
   * @retval Tensor& reference to the out
   */
  Tensor &transpose(const std::string &direction, Tensor &out) const;
 923
  /**
   * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate)
   * @param[in] dropout drop out rate
   * @retval Tensor drop out mask, returned by value
   */
  Tensor dropout_mask(float dropout) const;

  /**
   * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) inplace
   * @param[in] dropout drop out rate
   */
  void dropout_mask(float dropout);

  /**
   * @brief Calculate filter mask
   * @param[in] mask_len length of each mask along the last axis
   * @param[in] reverse invert the mask
   */
  void filter_mask(const Tensor &mask_len, bool reverse = false);

  /**
   * @brief Calculate 2 Zone Out Mask
   * @details Calculate zone out mask according to the bernoulli distribution.
   * Zone out mask with rate @a zoneout for inplace and the other zone out mask
   * with rate @a (1-zoneout).
   * @param[in] zoneout zone out rate
   * @retval Tensor zone out mask for opposite tensor
   */
  Tensor zoneout_mask(float zoneout);

  /**
   * @brief Calculate 2 Zone Out Mask
   * @details Calculate zone out mask according to the bernoulli distribution.
   * Zone out mask with rate @a zoneout for inplace and the other zone out mask
   * with rate @a (1-zoneout).
   * @param[out] opposite opposite zone out mask
   * @param[in] zoneout zone out rate
   */
  void zoneout_mask(Tensor &opposite, float zoneout);
 963
  /**
   * @brief     sum all the Tensor elements according to the batch
   * @retval    Calculated Tensor(batch, 1, 1, 1)
   */
  Tensor sum_by_batch() const;

  /**
   * @brief     sum all the Tensor elements according to the axis
   *            0 : batch direction
   *            1 : channel direction
   *            2 : height direction
   *            3 : width direction
   * @param[in] axis Axis to calculate sum along
   * @param[in] alpha Scale the sum by this value
   * @retval    Calculated Tensor
   */
  Tensor sum(unsigned int axis, float alpha = 1.0) const;

  /**
   * @brief     sum all the Tensor elements according to the axis
   *            0 : batch direction
   *            1 : channel direction
   *            2 : height direction
   *            3 : width direction
   * @param[in] axis Axis to calculate sum along
   * @param[out] output output tensor
   * @param[in] alpha Scale the sum by this value
   * @param[in] beta NOTE(review): undocumented in the original; presumably the
   * scale applied to the existing contents of @a output before the sum is
   * added -- confirm against the implementation.
   * @retval    Calculated Tensor (returned by reference)
   */
  Tensor &sum(unsigned int axis, Tensor &output, float alpha = 1.0,
              float beta = 0.0) const;

  /**
   * @brief sum all the Tensor by multiple axes
   *
   * @param[in] axes axes to sum along
   * @param[in] alpha Scale the sum by this value
   * @return Tensor
   */
  Tensor sum(const std::vector<unsigned int> &axes, float alpha = 1.0) const;

  /**
   * @brief sum all the Tensor by multiple axes
   *
   * @param[in] axes axes to sum along
   * @param[out] output output tensor
   * @param[in] alpha Scale the sum by this value
   * @return Tensor& calculated tensor, returned by reference
   */
  Tensor &sum(const std::vector<unsigned int> &axes, Tensor &output,
              float alpha = 1.0) const;
1015
  /**
   * @brief     Averaging the Tensor elements according to the axis
   *            0 : batch direction
   *            1 : channel direction
   *            2 : height direction
   *            3 : width direction
   * @param[in] axis Axis to average along
   * @retval    Calculated Tensor
   */
  Tensor average(unsigned int axis) const;
  /**
   * @brief     Averaging the Tensor elements according to the axis
   * @param[in] axis Axis to average along
   * @param[out] output output tensor
   * @retval    Calculated Tensor (returned by reference)
   */
  Tensor &average(unsigned int axis, Tensor &output) const;

  /**
   * @brief average all the Tensor by multiple axes
   *
   * @param[in] axes axes to average along
   * @return Tensor
   */
  Tensor average(const std::vector<unsigned int> &axes) const;

  /**
   * @brief average all the Tensor by multiple axes
   *
   * @param[in] axes axes to average along
   * @param[out] output output tensor
   * @return Tensor& calculated tensor, returned by reference
   */
  Tensor &average(const std::vector<unsigned int> &axes, Tensor &output) const;

  /**
   * @brief     Averaging the Tensor elements by all axis
   * @retval    Calculated Tensor
   */
  Tensor average() const;

  /**
   * @brief     Averaging the Tensor elements by all axis
   * @param[out] output output tensor
   * @retval    Calculated Tensor (returned by reference)
   */
  Tensor &average(Tensor &output) const;
1060
  /**
   * @brief     Anchor a starting point to defer following evaluation
   * @retval    LazyTensor class that can be used with run();
   */
  LazyTensor chain() const;

  /**
   * @brief     Softmax the Tensor elements
   * @retval    Calculated Tensor
   */
  Tensor softmax() const;

  /**
   * @brief     l2norm the Tensor elements
   * @retval    Calculated l2norm
   */
  float l2norm() const;

  /**
   * @brief     Normalize the Tensor elements
   * @param[out] output output tensor
   * @retval    Calculated Tensor (returned by reference)
   */
  Tensor &normalization(Tensor &output) const;

  /**
   * @brief     Standardize the Tensor elements
   * @param[out] output output tensor
   * @retval    Calculated Tensor (returned by reference)
   */
  Tensor &standardization(Tensor &output) const;

  /**
   * @brief     Normalize the Tensor elements in-place
   * @note      Returns nothing; the result replaces the contents of this
   * tensor.
   */
  void normalization_i();

  /**
   * @brief     Standardize the Tensor elements in-place
   * @note      Returns nothing; the result replaces the contents of this
   * tensor.
   */
  void standardization_i();
1102
1103   template <typename T = float> T *getAddress(unsigned int i) {
1104     size_t index = getIndex(batch(), channel(), height(), width());
1105     if (i > index) {
1106       return nullptr;
1107     }
1108     return &getData<T>()[i];
1109   }
1110
1111   /**
1112    * @brief     i data index
1113    * @retval    address of ith data
1114    */
1115   template <typename T = float> const T *getAddress(unsigned int i) const {
1116     size_t index = getIndex(batch(), channel(), height(), width());
1117     if (i > index) {
1118       return nullptr;
1119     }
1120
1121     return &getData<T>()[i];
1122   }
1123
1124   /**
1125    * @brief    get address of n-d data
1126    */
1127   template <typename T = float>
1128   T *getAddress(unsigned int b, unsigned int c, unsigned int h,
1129                 unsigned int w) {
1130     return getAddress<T>(getIndex(b, c, h, w));
1131   }
1132
1133   /**
1134    * @brief    get address of n-d data
1135    */
1136   template <typename T = float>
1137   const T *getAddress(unsigned int b, unsigned int c, unsigned int h,
1138                       unsigned int w) const {
1139     return getAddress<T>(getIndex(b, c, h, w));
1140   }
1141
1142   /**
1143    * @brief Apply instantly to the element
1144    *
1145    * @param f function to apply
1146    * @return int ML_ERROR_NONE if successful
1147    */
1148   int apply_i(std::function<float(float)> f) {
1149     Tensor result = *this;
1150     apply(f, result);
1151
1152     return ML_ERROR_NONE;
1153   };
1154
1155   /**
1156    * @brief     Apply function element by element
1157    * @param[in] *function function pointer applied
1158    * @retval    Tensor
1159    */
1160   Tensor apply(std::function<float(float)> f) const {
1161     Tensor result;
1162     return apply(f, result);
1163   };
1164
1165   /**
1166    * @brief     Apply function element by element
1167    * @param[in] *function function pointer applied
1168    * @param[out] output output tensor
1169    * @retval    Tensor
1170    */
1171   Tensor &apply(std::function<float(float)> f, Tensor &output) const {
1172     CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
1173
1174     if (dim != output.dim) {
1175       /// @todo add unittest
1176       throw std::invalid_argument(
1177         "[Tensor::apply] output dimension does not match");
1178     }
1179
1180     if (dim.getDataType() == Tdatatype::FP32) {
1181       if (contiguous && output.contiguous) {
1182         const float *data = (getData<float>());
1183         float *rdata = (output.getData<float>());
1184
1185         std::transform(data, data + size(), rdata, f);
1186       } else if (strides[3] == 1 && output.strides[3] == 1) {
1187         /** @todo optimize this with combining these loops where stride is 1 */
1188         for (unsigned int b = 0; b < batch(); ++b) {
1189           for (unsigned int c = 0; c < channel(); ++c) {
1190             for (unsigned int h = 0; h < height(); ++h) {
1191               float *out_data = output.getAddress<float>(b, c, h, 0);
1192               const float *in_data = getAddress<float>(b, c, h, 0);
1193               std::transform(in_data, in_data + width(), out_data, f);
1194             }
1195           }
1196         }
1197       } else {
1198         for (unsigned int b = 0; b < batch(); ++b) {
1199           for (unsigned int c = 0; c < channel(); ++c) {
1200             for (unsigned int h = 0; h < height(); ++h) {
1201               for (unsigned int w = 0; w < width(); ++w) {
1202                 output.setValue(b, c, h, w, f(getValue<float>(b, c, h, w)));
1203               }
1204             }
1205           }
1206         }
1207       }
1208     } else if (dim.getDataType() == Tdatatype::FP16) {
1209 #ifdef ENABLE_FP16
1210       if (contiguous && output.contiguous) {
1211         const __fp16 *data = (getData<__fp16>());
1212         __fp16 *rdata = (output.getData<__fp16>());
1213
1214         std::transform(data, data + size(), rdata, f);
1215       } else if (strides[3] == 1 && output.strides[3] == 1) {
1216         /** @todo optimize this with combining these loops where stride is 1 */
1217         for (unsigned int b = 0; b < batch(); ++b) {
1218           for (unsigned int c = 0; c < channel(); ++c) {
1219             for (unsigned int h = 0; h < height(); ++h) {
1220               __fp16 *out_data = (__fp16 *)output.getAddress(b, c, h, 0);
1221               const __fp16 *in_data = (__fp16 *)getAddress(b, c, h, 0);
1222               std::transform(in_data, in_data + width(), out_data, f);
1223             }
1224           }
1225         }
1226       } else {
1227         for (unsigned int b = 0; b < batch(); ++b) {
1228           for (unsigned int c = 0; c < channel(); ++c) {
1229             for (unsigned int h = 0; h < height(); ++h) {
1230               for (unsigned int w = 0; w < width(); ++w) {
1231                 output.setValue(b, c, h, w,
1232                                 f((float)((__fp16)getValue(b, c, h, w))));
1233               }
1234             }
1235           }
1236         }
1237       }
1238 #else
1239       throw std::invalid_argument("Error: enable-fp16 is not enabled");
1240 #endif
1241     }
1242     return output;
1243   };
1244
  /**
   * @brief     Apply function to Tensor
   * @param[in] f function taking a Tensor and returning a Tensor
   * @retval    Tensor
   */
  Tensor apply(std::function<Tensor(Tensor)> f) const;

  /**
   * @brief     Apply function to Tensor
   * @param[in] f function taking a Tensor and an output Tensor reference
   * @param[out] output output tensor
   * @retval    Tensor (returned by reference)
   */
  Tensor &apply(std::function<Tensor &(Tensor, Tensor &)> f,
                Tensor &output) const;

  /**
   * @brief     Print element
   * @param[in] out out stream
   */
  void print(std::ostream &out) const;

  /**
   * @brief     Print element
   * @param[in] out out stream
   * @param[in] opt print formatting option. opt=0 would pretty print the data,
   * else it would print the raw data.
   */
  void print_(std::ostream &out, uint opt = 0) const;
1276
1277   /**
1278    * @brief     Get size of current tensor
1279    * @retval    unsigned int size of the current tensor
1280    */
1281   size_t size() const { return dim.getDataLen(); }
1282
1283   /**
1284    * @brief     Get if the tensor is empty
1285    * @retval    true if the tensor is empty
1286    */
1287   bool empty() const { return size() == 0; }
1288
1289   /**
1290    * @brief     Get size of the data in bytes
1291    * @retval    size_t Size in bytes
1292    */
1293   size_t bytes() const { return size() * dim.getDataTypeSize(); }
1294
1295   /**
1296    * @brief     Set the element value
1297    * @param[in] batch batch location
1298    * @param[in] c channel location
1299    * @param[in] h height location
1300    * @param[in] w width location
1301    * @param[in] value value to be stored
1302    */
1303   void setValue(unsigned int batch, unsigned int c, unsigned int h,
1304                 unsigned int w, float value) noexcept {
1305     if (getDataType() == Tdatatype::FP32) {
1306       getData<float>()[getIndex(batch, c, h, w)] = value;
1307     } else if (getDataType() == Tdatatype::FP16) {
1308 #ifdef ENABLE_FP16
1309       getData<__fp16>()[getIndex(batch, c, h, w)] = value;
1310 #else
1311       ml_loge("%s", "Error: enable-fp16 is not enabled");
1312 #endif
1313     }
1314   }
1315
1316   /**
1317    * @brief     add the element value to the location
1318    * @param[in] batch batch location
1319    * @param[in] c channel location
1320    * @param[in] h height location
1321    * @param[in] w width location
1322    * @param[in] value value to be stored
1323    * @param[in] beta scalar to multiply output with and add
1324    */
1325   void addValue(unsigned int batch, unsigned int c, unsigned int h,
1326                 unsigned int w, float value, float beta) noexcept {
1327     auto const &idx = getIndex(batch, c, h, w);
1328     if (dim.getDataType() == Tdatatype::FP32) {
1329       getData<float>()[idx] *= beta;
1330       getData<float>()[idx] += value;
1331     } else if (dim.getDataType() == Tdatatype::FP16) {
1332 #ifdef ENABLE_FP16
1333       getData<__fp16>()[idx] *= beta;
1334       getData<__fp16>()[idx] += value;
1335 #else
1336       ml_loge("%s", "Error: enable-fp16 is not enabled");
1337 #endif
1338     }
1339   }
1340
1341   /**
1342    * @brief     Set the element value
1343    * @param[in] offset offset from start location
1344    * @param[in] value value to be stored
1345    *
1346    * @todo      This is a temporary workout. Remove this once multiple datatypes
1347    * are supported.
1348    */
1349   void setValueInt(unsigned int offset, int value) noexcept {
1350     int *data_int = (int *)getData();
1351     data_int[offset] = value;
1352   }
1353
  /**
   * @brief     Fill the Tensor elements with value
   * @param[in] value value to be stored in every element
   */
  void setValue(float value);

  /**
   * @brief     Fill the Tensor elements with zero
   */
  void setZero();
1364
1365   /**
1366    * @brief Set the Dist object
1367    *
1368    * @tparam T distrubution engine
1369    * @param dist distribution engine
1370    */
1371   template <typename T, typename Engine> void setDist(Engine dist) {
1372     NNTR_THROW_IF(!contiguous, std::invalid_argument)
1373       << getName() << " Tensor is not contiguous, cannot set distribution";
1374
1375     T *data_ = getData<T>();
1376     unsigned int len = size();
1377     for (unsigned int i = 0; i < len; ++i) {
1378       data_[i] = (T)dist(rng);
1379     }
1380   };
1381
  /**
   * @brief     Set the tensor with random normal distribution
   * @param[in] mean mean of the distribution
   * @param[in] std standard deviation of the distribution
   */
  void setRandNormal(float mean = 0.0f, float std = 0.05f);

  /**
   * @brief     Set the tensor with random uniform distribution
   * @param[in] min minimum value for the distribution
   * @param[in] max maximum value for the distribution
   */
  void setRandUniform(float min = -0.05f, float max = 0.05f);

  /**
   * @brief     Set the tensor with random bernoulli distribution
   * @param[in] probability probability value for the distribution
   */
  void setRandBernoulli(float probability = 0.5f);

  /**
   * @brief     Initialize the memory of the given tensor
   * @note      NOTE(review): presumably uses the initializer stored in this
   * tensor (see the initialize(Initializer) overload) -- confirm against the
   * implementation.
   */
  void initialize();
1406
1407   /**
1408    * @brief     Initialize the memory of the given tensor
1409    * @param     init Initiailizer to use for the initialization
1410    */
1411   void initialize(Initializer init) {
1412     initializer = init;
1413     initialize();
1414   }
1415
1416   /**
1417    * @brief     set the memory format
1418    * @param     fm format of Tensor
1419    */
1420   void convertFormat(TensorDim::Format fm) {
1421     if (getFormat() != fm) {
1422       transpose("2:1:0");
1423     }
1424
1425     dim.setFormat(fm);
1426   }
1427
  /**
   * @brief     Copy the Tensor
   * @param[in] from Tensor to be copied
   *
   * @note copy can reshape the tensor to match the shape
   */
  void copy(const Tensor &from);

  /**
   * @brief     Copy the Tensor data
   * @param[in] from Tensor to be copied
   * @note      NOTE(review): presumably copies only the values, without the
   * reshaping that copy() may do -- confirm against the implementation.
   */
  void copyData(const Tensor &from);

  /**
   * @brief     Copy the Tensor with stride
   * @param[in] from Tensor to be copied
   * @note      NOTE(review): presumably honours the strides of both tensors
   * instead of assuming contiguous storage -- confirm against the
   * implementation.
   */
  void copy_with_stride(const Tensor &from);
1447
  /**
   * @brief Get slice of the tensor, sliced by batch
   * @param[in] offset offset in batch to start the slice
   * @param[in] size size of the slice
   * @retval slice of this tensor
   * @note This function provides a slice of this tensor, and does not create a
   * copy
   */
  Tensor getBatchSlice(size_t offset, unsigned int size) const;

  /**
   * @brief Get new tensor which shares memory with current tensor but different
   * shape
   *
   * @param[in] dim new dimension to be set for this tensor
   * @param[in] offset offset to be used from the start of the data in elements
   * @param[in] reset_stride NOTE(review): undocumented in the original;
   * presumably selects whether the new tensor recomputes its strides from
   * @a dim or keeps those of this tensor -- confirm against the
   * implementation.
   * @param[in] name_ name for the new tensor (empty by default)
   * @return Tensor new tensor sharing the data of this tensor
   * @note The new tensor will share the same data as the current tensor but
   * can have different size.
   * @note New size added with offset must be less than the size of the original
   * tensor.
   */
  Tensor getSharedDataTensor(const TensorDim dim, size_t offset,
                             bool reset_stride = true,
                             const std::string &name_ = "") const;
  /**
   * @brief split tensor along axis.
   *
   * @param[in] num_size num_size
   * @param[in] axis axis
   * @return std::vector<Tensor> split tensors
   */
  std::vector<Tensor> split(unsigned num_size, int axis = 0);

  /**
   * @brief split tensor along axis.
   *
   * @param[in] sizes sizes
   * @param[in] axis axis
   * @return std::vector<Tensor> split tensors
   * @note if the given array sizes is just a 1 unsigned int value, assumes that
   * it divide tensor by given size evenly
   */
  std::vector<Tensor> split(std::vector<size_t> sizes, int axis = 0);

  /**
   * @brief concatenate tensors along axis
   *
   * @param[in] tensors tensors to be concatenated to the first tensor
   * @param[in] axis axis
   * @return Tensor concatenated tensor
   */
  static Tensor cat(const std::vector<Tensor> &tensors, int axis = 0);
1500
  /**
   * @brief make this tensor share memory with given tensor
   *
   * @param[in] src Source tensor whose memory is to be shared
   * @param[in] offset offset to be used from the start of the data in bytes.
   * NOTE(review): getSharedDataTensor() documents its offset in elements --
   * confirm which unit applies here.
   * @note This tensor will share the same data as the source tensor but
   * can have different size.
   * @note This tensor's size added with offset must be less than the size of
   * the source tensor.
   * @note The stride of the source tensor and this tensor must be same.
   */
  void makeSharedDataTensor(const Tensor &src, size_t offset = 0);

  /**
   * @brief     Convenient wrapper for inplace copy of @a this.
   * @retval    Copied version of this
   */
  Tensor clone() const;

  /**
   * @brief     Save the Tensor into file
   * @param[in] file output file stream
   */
  void save(std::ostream &file);

  /**
   * @brief     Read the Tensor from file
   * @param[in] file input file stream
   */
  void read(std::ifstream &file);
1531
  /**
   * @brief     return argument index which value is max by batch
   * @retval    vector of argument indices, one per batch
   */
  std::vector<unsigned int> argmax() const;

  /**
   * @brief     return max of the absolute values of the tensor
   * @retval    maximum absolute value
   */
  float max_abs() const;
1543
1544   /**
1545    * @brief     return a copy of the Tensor Dim
1546    * @retval    TensorDim
1547    */
1548   TensorDim getDim() const { return TensorDim(dim); }
1549
  /**
   * @brief     return Tensor Dim for a given axis
   * @param[in] axis axis to query
   * @retval    dimension
   */
  size_t getTensorDim(unsigned int axis);
1555
1556   /**
1557    * @brief     return Tensor Type
1558    */
1559   TensorDim::TensorType getTensorType() const { return dim.getTensorType(); };
1560
1561   /**
1562    * @brief     return Tensor batch size
1563    * @retval    batch size
1564    */
1565   size_t batch() const { return dim.batch(); }
1566
1567   /**
1568    * @brief     return Tensor batch size
1569    * @retval    batch size
1570    */
1571   size_t channel() const { return dim.channel(); }
1572
1573   /**
1574    * @brief     return Tensor height size
1575    * @retval    height size
1576    */
1577   size_t height() const { return dim.height(); }
1578
1579   /**
1580    * @brief     return Tensor batch size
1581    * @retval    width size
1582    */
1583   size_t width() const { return dim.width(); }
1584
1585   /**
1586    * @brief     return Tensor Data Type Size
1587    * @retval    data type size
1588    */
1589   uint getDataTypeSize() const { return dim.getDataTypeSize(); }
1590
1591   /**
1592    * @brief     update batch size for this tensor
1593    * @param     batch size
1594    * @note      The batchsize of src_tensor need not be related with this
1595    * tensor's batch size
1596    *
1597    * @note      The memory for this tensor will re-allocated/re-assigned if the
1598    * updated batch size is different than the current batch size.
1599    *
1600    * @note      If this tensor is/was the src_tensor for some other, then
1601    * reduction in batch size can make the dependent tensors allocate fail due to
1602    * memory smaller. Caller must handle this in their own end.
1603    *
1604    * @note      If this tensor is re-allocated, then the memory might not be
1605    * immediately freed as the tensor already depending on this tensor also
1606    * share the same memory. So, the peak memory consumption in worst case can
1607    * reach the total memory requirements of a model with old batchsize and the
1608    * new batch size. It is recommended to first deallocate all the tensors,
1609    * updateBatch and then allocate again to avoid such issues.
1610    */
1611   void updateBatch(unsigned int batch) {
1612     if (dim.batch() == batch) {
1613       return;
1614     }
1615
1616     if (isAllocated())
1617       throw std::invalid_argument(
1618         "Cannot update batch for an allocated tensor");
1619     dim.batch(batch);
1620   }
1621
1622   /**
1623    * @brief     return Data pointer of Tensor
1624    * @retval    template T pointer (float pointer as default)
1625    */
1626   template <typename T = float> T *getData() {
1627     if (!data)
1628       return nullptr;
1629
1630     data->validate();
1631     return data->getAddr<T>() + offset;
1632   }
1633
1634   /**
1635    * @brief     return Data pointer of Tensor
1636    * @retval    template T pointer (float pointer as default)
1637    */
1638   template <typename T = float> const T *getData() const {
1639     if (!data)
1640       return nullptr;
1641
1642     data->validate();
1643     return data->getAddr<T>() + offset;
1644   }
1645
1646   /**
1647    * @brief     return Data pointer of Tensor
1648    * @retval    template T pointer (float pointer as default)
1649    */
1650   template <typename T = float> T *getData(size_t idx) const {
1651     if (!data)
1652       return nullptr;
1653
1654     size_t index = idx;
1655
1656     data->validate();
1657     return data->getAddr<T>() + offset + index;
1658   }
1659
1660   void setDataType(Tdatatype d_type) { dim.setDataType(d_type); }
1661
1662   void setTensorType(ml::train::TensorDim::TensorType t_type) {
1663     dim.setTensorType(t_type);
1664   }
1665
1666   /**
1667    * @brief     put data of Tensor
1668    *
1669    * @note      It is only effective when memory_swap is used
1670    */
1671   void putData() const {
1672     if (!data)
1673       return;
1674
1675     data->invalidate();
1676   }
1677
1678   /**
1679    * @brief     return Data pointer of Tensor
1680    * @retval    template T pointer (float pointer as default)
1681    */
1682   const std::shared_ptr<MemoryData> getMemoryData() const { return data; }
1683
1684   /**
1685    * @brief     return offset
1686    */
1687   size_t getOffset() const { return offset; }
1688
  /**
   * @brief     set Tensor Dim
   * @param[in] d TensorDim
   * @note      Throws std::invalid_argument if size mismatch
   */
  void reshape(const TensorDim &d);

  /**
   * @brief fill tensor data with current value,
   * if dimension is not exactly same, it is a hard error in this function
   * so, only stride is overridden to @a this
   *
   * @param[in] from Tensor to fill the data from
   * @param[in] allocate if unallocated, allocate with from.getDim()
   * @throws std::invalid_argument if dimension and stride does not match
   */
  void fill(const Tensor &from, bool allocate = false);
1710
1711   /**
1712    * @brief     return current stride of tensor.
1713    * @retval    int[MAXDIM] strides
1714    */
1715   const std::array<size_t, TensorDim::MAXDIM> getStrides() const noexcept {
1716     return strides;
1717   }
1718   /**
1719    * @brief Get linear index given the n-d index
1720    */
1721   inline size_t getIndex(unsigned int b, unsigned int c, unsigned int h,
1722                          unsigned int w) const noexcept {
1723     if (getFormat() == Tformat::NCHW) {
1724       return (b * strides[0] + c * strides[1] + h * strides[2] +
1725               w * strides[3]);
1726     } else {
1727       return (b * strides[0] + h * strides[1] + w * strides[2] +
1728               c * strides[3]);
1729     }
1730   }
1731
1732   /**
1733    * @brief Check if two given axes are contiguous
1734    */
1735   bool checkContinuous(unsigned int n, unsigned int np1) const {
1736     std::vector<unsigned int> continuous_order_nhwc = {0, 3, 1, 2};
1737     bool continuous = false;
1738     if (getFormat() == Tformat::NHWC) {
1739       if (continuous_order_nhwc[np1] == continuous_order_nhwc[n] + 1)
1740         continuous = true;
1741     } else {
1742       if (n + 1 == np1)
1743         continuous = true;
1744     }
1745     return continuous;
1746   }
1747
1748   /**
1749    * @brief   Get name of the tensor
1750    *
1751    * @return name of the tensor
1752    */
1753   void setName(const std::string &name_) { name = name_; }
1754
1755   /**
1756    * @brief   Get name of the tensor
1757    *
1758    * @return name of the tensor
1759    */
1760   const std::string &getName() const { return name; }
1761
1762   /**
1763    * @brief Set the memory buffer for the tensor
1764    *
1765    * @param buf the memory buffer
1766    * @param init intialize the buffer
1767    */
1768   void setData(const std::shared_ptr<MemoryData> buf, size_t off = 0,
1769                bool init = false) {
1770     if (buf) {
1771       data = buf;
1772       offset = off;
1773       if (init)
1774         initialize();
1775     } else {
1776       data = nullptr;
1777       offset = 0;
1778     }
1779   }
1780
1781   /**
1782    * @brief Get initializer for the tensor
1783    *
1784    * @return initializer of the tensor
1785    */
1786   Tensor::Initializer getInitializer() const { return initializer; }
1787
1788   /**
1789    * @brief Get format for the tensor
1790    *
1791    * @return format of the tensor
1792    */
1793   TensorDim::Format getFormat() const { return dim.getFormat(); }
1794
1795   /**
1796    * @brief Get data type for the tensor
1797    *
1798    * @return data type of the tensor
1799    */
1800   Tdatatype getDataType() const { return dim.getDataType(); }
1801
  /** Small constant shared by tensor computations (uses are not visible in
   * this part of the header). */
  static constexpr float epsilon = 1e-5;

private:
  TensorDim dim; /**< dimension, format and data type of the tensor */
  std::array<size_t, TensorDim::MAXDIM> strides; /**< per-slot strides used by
                                                    getIndex() */
  bool contiguous; /**< flag checked by apply() and setDist() before treating
                      the data as one flat range */
  Tensor::Initializer initializer; /**< initializer, set by
                                      initialize(Initializer) */
  std::string name; /**< name of the tensor */
  std::shared_ptr<MemoryData> data; /**< memory backing the tensor; empty when
                                       no buffer is attached */
  size_t offset; /**< added to the address of @a data in getData() */

  /**<
   * When using shared_data with tensor, this stores the ptr of the source
   * tensor which handles the full memory. If tensor data is already allocated,
   * this does not affect the tensor. If the tensor data is not allocated, and
   * src_ptr is valid, this tensor will use the memory allocated by the src_ptr
   */
  std::shared_ptr<SrcSharedTensor> src_tensor;

  /** Forward declaration; used as a parameter type by the broadcast helpers
   * declared below. */
  struct BroadcastInfo;
1823
1824   /**
1825    * @brief Applies the given operator to the tensor with the passed argument
1826    * @param[in] m Tensor
1827    * @param[in] v_func vectorized function to apply
1828    * @param e broadcast info.
1829    * @param cur_axis current axis. pass default when calling outside.
1830    * @param offset offset for this.  pass default when calling outside.
1831    * @param m_offset offset for m.  pass default when calling outside.
1832    * @retval #ML_ERROR_NONE Successful
1833    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
1834    */
1835   void
1836   apply_broadcast_util(Tensor const &m,
1837                        std::function<void(const BroadcastInfo &e, const float *,
1838                                           const float *, float *)>
1839                          v_func,
1840                        Tensor &output, const BroadcastInfo &e,
1841                        int cur_axis = -1, size_t offset = 0,
1842                        size_t m_offset = 0) const;
1843
1844   /**
1845    * @brief Applies the given operator to the tensor with the passed argument
1846    *
1847    * @param[in] m Tensor
1848    * @param[in] v_func vectorized function to apply
1849    * @retval #ML_ERROR_NONE Successful
1850    * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
1851    */
1852   void apply_broadcast(Tensor const &m,
1853                        std::function<void(const BroadcastInfo &e, const float *,
1854                                           const float *, float *)>
1855                          v_func,
1856                        Tensor &output) const;
1857 #ifdef ENABLE_FP16
1858   void apply_broadcast_util(
1859     Tensor const &m,
1860     std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1861                        __fp16 *)>
1862       v_func,
1863     Tensor &output, const BroadcastInfo &e, int cur_axis = -1,
1864     size_t offset = 0, size_t m_offset = 0) const;
1865
1866   void
1867   apply_broadcast(Tensor const &m,
1868                   std::function<void(const BroadcastInfo &e, const __fp16 *,
1869                                      const __fp16 *, __fp16 *)>
1870                     v_func,
1871                   Tensor &output) const;
1872 #endif
  /**
   * @brief Compute the loop info used for broadcasting and vectorization
   *
   * @param m target tensor that this tensor is to be calculated against
   * @return BroadcastInfo loop info needed to run the external loop
   */
  BroadcastInfo computeBroadcastInfo(const Tensor &m) const;
1880
  /**
   * @brief Copy a buffer into @a this
   *
   * @param buf buffer to copy from
   * @note The caller has to ensure that @a this is initialized before the
   * call; otherwise the behavior is undefined.
   */
  void copy(const void *buf);
1888
  /**
   * @brief Update the destination tensor so that it shares memory with the
   * source tensor
   *
   * @param src source tensor containing the memory
   * @param dest destination tensor which will share the memory
   * @param offset offset to be used from the start of the data, in bytes
   * @note The destination tensor will share the same data as the source
   * tensor but can have a different size.
   * @note The destination size added to the offset must be less than the
   * size of the source tensor.
   */
  static void createSharedDataTensor(const Tensor &src, Tensor &dest,
                                     size_t offset);
1902
1903   /**
1904    * @brief    Reallocate memory for this tensor
1905    * @note     This will not necessary free the memory as tensors share memory
1906    * @note     This can increase the peak memory consumption when callled on all
1907    * the tensors of a model sequentially. It is advised to first deallocate all
1908    * the tensors and then allocate, than reallocate tensors one by one.
1909    */
1910   void reallocate() {
1911     deallocate();
1912     allocate();
1913   }
1914
  /**
   * @brief Merge the two given axes of the tensor into the second axis, in
   * place
   *
   * @param axis1 first axis to merge
   * @param axis2 second axis to merge
   */
  void mergeAxis(unsigned int axis1, unsigned int axis2);
1922
  /**
   * @brief     Rotate a tensor by 180 degrees
   * @param[in] in input Tensor (taken by value)
   * @retval    Tensor rotated tensor (180 degrees)
   */
  Tensor rotate_180(Tensor in);
1929
}; // class Tensor
1931
1932 /**
1933  * @brief   Overriding output stream
1934  */
1935 std::ostream &operator<<(std::ostream &out, Tensor const &m);
1936
1937 typedef std::shared_ptr<Tensor> sharedTensor;
1938
1939 typedef std::shared_ptr<const Tensor> sharedConstTensor;
1940
1941 typedef std::vector<sharedConstTensor> sharedConstTensors;
1942
1943 typedef std::vector<sharedTensor> sharedTensors;
1944
1945 } /* namespace nntrainer */
1946
1947 #endif /* __cplusplus */
1948 #endif /* __TENSOR_H__ */