nntrainer/tensor/tensor.cpp

   1 /**
   2  * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *   http://www.apache.org/licenses/LICENSE-2.0
   8  * Unless required by applicable law or agreed to in writing, software
   9  * distributed under the License is distributed on an "AS IS" BASIS,
  10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11  * See the License for the specific language governing permissions and
  12  * limitations under the License.
  13  *
  14  *
  15  * @file        tensor.cpp
  16  * @date        04 December 2019
  17  * @brief       This is Tensor class for calculation
  18  * @see         https://github.com/nnstreamer/nntrainer
  19  * @author      Jijoong Moon <jijoong.moon@samsung.com>
  20  * @bug         No known bugs except for NYI items
  21  *
  22  */
  23
  24 #include <algorithm>
  25 #include <assert.h>
  26 #include <cmath>
  27 #include <cstring>
  28 #include <fstream>
  29 #include <iomanip>
  30 #include <iostream>
  31 #include <iterator>
  32 #include <numeric>
  33 #include <regex>
  34 #include <sstream>
  35 #include <stdexcept>
  36 #include <stdio.h>
  37
  38 #include <lazy_tensor.h>
  39 #include <tensor.h>
  40 #include <util_func.h>
  41
  42 #define transposeloop(cl, ci, cj, ck, sl, si, sj, sk)                 \
  43   do {                                                                \
  44     unsigned int i, j, k, l;                                          \
  45     int inidx = 0, outidx = 0;                                        \
  46     for (cl = 0; cl < sl; cl++)                                       \
  47       for (ci = 0; ci < si; ci++)                                     \
  48         for (cj = 0; cj < sj; cj++)                                   \
  49           for (ck = 0; ck < sk; ck++) {                               \
  50             outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
  51             inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k;      \
  52             outptr[outidx] = inptr[inidx];                            \
  53           }                                                           \
  54   } while (0);
  55
  56 #define transposeloop_nhwc(cl, ci, cj, ck, sl, si, sj, sk)            \
  57   do {                                                                \
  58     unsigned int i, j, k, l;                                          \
  59     int inidx = 0, outidx = 0;                                        \
  60     for (cl = 0; cl < sl; cl++)                                       \
  61       for (ci = 0; ci < si; ci++)                                     \
  62         for (cj = 0; cj < sj; cj++)                                   \
  63           for (ck = 0; ck < sk; ck++) {                               \
  64             outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
  65             inidx = l * SJ * SK * SI + j * SK * SI + k * SI + i;      \
  66             outptr[outidx] = inptr[inidx];                            \
  67           }                                                           \
  68   } while (0);
  69
  70 namespace nntrainer {
  71
  72 /**
  73  * @struct External Loop Info for broadcasted info
  74  * @brief External Loop Info for broadcasted iteration. Please refer to
  75  * DISABLED_private_external_loop_n in unittest_nntrainer_tensor.
  76  * @note This should better be implemented in iterator fashion before used
  77  * extensively.
  78  */
  79 struct Tensor::BroadcastInfo {
  80
  81   /**
  82    * @brief Construct a new External Loop Info object
  83    *
  84    */
  85   BroadcastInfo() :
  86     buffer_size(0),
  87     buffer_axis(-1),
  88     strides{0, 0, 0, 0},
  89     tensor_type(nntrainer::TensorDim::TensorType()) {}
  90
  91   unsigned int buffer_size; /**< virtual size of the buffer */
  92   int buffer_axis;          /**< the smallest axis that should be looped.
  93                                  -1 means no loop needed*/
  94   std::array<unsigned int, TensorDim::MAXDIM>
  95     strides; /**< modified strides for the loop */
  96   nntrainer::TensorDim::TensorType tensor_type;
  97 };
  98
  99 Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init,
 100                std::string name_) :
 101   Tensor(name_) {
 102   if (d.getDataLen() != 0) {
 103     dim = d;
 104     strides = d.computeStrides();
 105     initializer = init;
 106     if (alloc_now)
 107       allocate();
 108   }
 109 }
 110
 111 Tensor::Tensor(const TensorDim &d, const void *buf) : Tensor(d, true) {
 112   if (d.getDataLen() != 0) {
 113     if (buf != nullptr)
 114       copy(buf);
 115   }
 116 }
 117
 118 /**
 119  * @class SrcSharedTensor
 120  * @brief Source of the shared tensor
 121  */
 122 class SrcSharedTensor {
 123 public:
 124   /**
 125    * @brief   Constructor for the class
 126    */
 127   SrcSharedTensor() : src(nullptr), off(0) {}
 128
 129   SrcSharedTensor(const Tensor *tensor, size_t offset) :
 130     src(tensor),
 131     off(offset) {}
 132
 133   /**
 134    * @brief   Get the allocated src tensor
 135    */
 136   const Tensor *tensor() const {
 137     if (!src)
 138       throw std::runtime_error("Accessing empty src tensor");
 139
 140     return src;
 141   }
 142
 143   /**
 144    * @brief   Get the offset from the source tensor
 145    */
 146   size_t offset() const { return off; }
 147
 148 private:
 149   const Tensor *src; /**< Tensor of the source */
 150   size_t off;        /**< offset from the source data ptr */
 151 };
 152
 153 void Tensor::allocate() {
 154   if (empty() || data)
 155     /// already allocated
 156     return;
 157
 158   if (src_tensor) {
 159     /// allocate data based on the source tensor
 160     data = src_tensor->tensor()->data;
 161     offset = src_tensor->tensor()->offset + src_tensor->offset();
 162     /** as this memory is shared, do NOT initialize */
 163   } else {
 164     /// allocate new memory for the tensor data
 165
 166     MemoryData *mem_data;
 167
 168     if (getDataType() == ml::train::TensorDim::DataType::FP32) {
 169       mem_data = new MemoryData((void *)(new float[dim.getDataLen()]()));
 170       data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
 171         delete[](float *) mem_data->getAddr();
 172         delete mem_data;
 173       });
 174
 175     } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
 176 #ifdef ENABLE_FP16
 177       mem_data = new MemoryData((void *)(new __fp16[dim.getDataLen()]()));
 178       data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
 179         delete[](__fp16 *) mem_data->getAddr();
 180         delete mem_data;
 181       });
 182 #else
 183       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 184 #endif
 185     }
 186     offset = 0;
 187     initialize();
 188   }
 189 }
 190
 191 bool Tensor::operator==(const Tensor &rhs) const {
 192   if (this->dim != rhs.dim)
 193     return false;
 194
 195   size_t len = size();
 196
 197   if (len != rhs.size())
 198     return false;
 199
 200   if (contiguous != rhs.contiguous)
 201     return false;
 202
 203   if (strides != rhs.strides)
 204     return false;
 205
 206   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 207     const float *_data = getData<float>();
 208     const float *_rdata = rhs.getData<float>();
 209     for (size_t i = 0; i < len; ++i) {
 210       /** not checking sign change is intentional to avoid float calculation
 211        * errors around 0 */
 212       if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
 213           (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
 214           std::fabs(_data[i] - _rdata[i]) > epsilon)
 215         return false;
 216     }
 217   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 218 #ifdef ENABLE_FP16
 219     const __fp16 *_data = getData<__fp16>();
 220     const __fp16 *_rdata = rhs.getData<__fp16>();
 221     for (size_t i = 0; i < len; ++i) {
 222       if ((std::isnan(_data[i]) && !std::isnan(_rdata[i])) ||
 223           (!std::isnan(_data[i]) && std::isnan(_rdata[i])) ||
 224           std::fabs(_data[i] - _rdata[i]) > epsilon)
 225         return false;
 226     }
 227 #else
 228     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 229 #endif
 230   }
 231
 232   return true;
 233 }
 234
 235 void Tensor::setRandNormal(float mean, float std) {
 236   if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
 237     setDist<float, std::normal_distribution<float>>(
 238       std::normal_distribution<float>(mean, std));
 239   } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
 240     throw std::invalid_argument(
 241       "__fp16 is not supported by std::normal_distribution");
 242   }
 243 }
 244
 245 void Tensor::setRandUniform(float min, float max) {
 246   if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
 247     setDist<float, std::uniform_real_distribution<float>>(
 248       std::uniform_real_distribution<float>(min, max));
 249   } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
 250     throw std::invalid_argument(
 251       "__fp16 is not supported by std::uniform_real_distribution");
 252   }
 253 }
 254
 255 void Tensor::setRandBernoulli(float probability) {
 256   if (this->getDataType() == ml::train::TensorDim::DataType::FP32) {
 257     setDist<float, std::bernoulli_distribution>(
 258       std::bernoulli_distribution(probability));
 259   } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) {
 260 #ifdef ENABLE_FP16
 261     setDist<__fp16, std::bernoulli_distribution>(
 262       std::bernoulli_distribution((__fp16)probability));
 263 #else
 264     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 265 #endif
 266   }
 267 }
 268
 269 void Tensor::initialize() {
 270   if (empty() || !isAllocated())
 271     return;
 272
 273   unsigned int fan_in, fan_out;
 274
 275   /// @fixme: when unit is equal to one, this does not work, we need to rely on
 276   /// effective dimension then actual numbers here. For now, some heuristics
 277   /// added to infer what would be fan_in/fan_out
 278   if (dim.batch() * dim.channel() * dim.height() == 1) {
 279     fan_out = fan_in = dim.width();
 280   } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
 281     fan_in = dim.height();
 282     fan_out = dim.width();
 283   } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
 284     auto field_size = dim.height() * dim.width();
 285
 286     // this also handles below cases.
 287     // 1. fan_in = fan_out = 1 as well.
 288     // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
 289     fan_in = dim.channel() * field_size;
 290     fan_out = dim.batch() * field_size;
 291   }
 292
 293   switch (initializer) {
 294   case Tensor::Initializer::ZEROS:
 295     setZero();
 296     break;
 297   case Tensor::Initializer::ONES:
 298     setValue(1.0f);
 299     break;
 300   case Tensor::Initializer::LECUN_NORMAL:
 301     setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
 302     break;
 303   case Tensor::Initializer::XAVIER_NORMAL:
 304     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
 305     break;
 306   case Tensor::Initializer::HE_NORMAL:
 307     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
 308     break;
 309   case Tensor::Initializer::LECUN_UNIFORM:
 310     setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
 311     break;
 312   case Tensor::Initializer::XAVIER_UNIFORM:
 313     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
 314                    sqrtFloat(6.0 / (fan_in + fan_out)));
 315     break;
 316   case Tensor::Initializer::HE_UNIFORM:
 317     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
 318                    sqrtFloat(6.0 / (fan_in)));
 319     break;
 320   default:
 321     break;
 322   }
 323
 324   putData();
 325 }
 326
 327 int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
 328   try {
 329     this->multiply_strided(m, *this, beta);
 330   } catch (std::exception &err) {
 331     ml_loge("%s %s", typeid(err).name(), err.what());
 332     return ML_ERROR_INVALID_PARAMETER;
 333   }
 334
 335   return ML_ERROR_NONE;
 336 }
 337
 338 Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
 339   Tensor t;
 340   return this->multiply_strided(m, t, beta);
 341 }
 342
 343 Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
 344                                  const float beta) const {
 345   /** TODO: throw than create new dimenions */
 346   CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
 347
 348   if (size() != m.size() || size() != output.size())
 349     throw std::invalid_argument(
 350       "Strided multiplication does not support broadcasting");
 351
 352   if (getDataType() == Tdatatype::FP32) {
 353     NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
 354       << getName() << " is not allocated";
 355     NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
 356       << m.getName() << " is not allocated";
 357     NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
 358       << output.getName() << " is not allocated";
 359   } else if (getDataType() == Tdatatype::FP16) {
 360 #ifdef ENABLE_FP16
 361     NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
 362       << getName() << " is not allocated";
 363     NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
 364       << m.getName() << " is not allocated";
 365     NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
 366       << output.getName() << " is not allocated";
 367 #else
 368     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 369 #endif
 370   }
 371
 372   // Format NCHW Case
 373   if (this->getFormat() == Tformat::NCHW) {
 374     if (getDataType() == Tdatatype::FP32) {
 375       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 376           beta != 0.0) {
 377         for (unsigned int b = 0; b < batch(); ++b) {
 378           for (unsigned int c = 0; c < channel(); ++c) {
 379             for (unsigned int h = 0; h < height(); ++h) {
 380               for (unsigned int w = 0; w < width(); ++w) {
 381                 output.addValue(b, c, h, w,
 382                                 getValue<float>(b, c, h, w) *
 383                                   m.getValue<float>(b, c, h, w),
 384                                 beta);
 385               }
 386             }
 387           }
 388         }
 389       } else {
 390         /** @todo optimize this with combining these loops where stride is 1
 391          */
 392         for (unsigned int b = 0; b < batch(); ++b) {
 393           for (unsigned int c = 0; c < channel(); ++c) {
 394             for (unsigned int h = 0; h < height(); ++h) {
 395               float *out_data = output.getAddress<float>(b, c, h, 0);
 396               const float *m_data = m.getAddress<float>(b, c, h, 0);
 397               const float *in_data = getAddress<float>(b, c, h, 0);
 398               std::transform(in_data, in_data + width(), m_data, out_data,
 399                              std::multiplies<float>());
 400             }
 401           }
 402         }
 403       }
 404     } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 405 #ifdef ENABLE_FP16
 406       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 407           beta != 0.0) {
 408         for (unsigned int b = 0; b < batch(); ++b) {
 409           for (unsigned int c = 0; c < channel(); ++c) {
 410             for (unsigned int h = 0; h < height(); ++h) {
 411               for (unsigned int w = 0; w < width(); ++w) {
 412                 output.addValue(b, c, h, w,
 413                                 getValue<__fp16>(b, c, h, w) *
 414                                   m.getValue<__fp16>(b, c, h, w),
 415                                 beta);
 416               }
 417             }
 418           }
 419         }
 420       } else {
 421         for (unsigned int b = 0; b < batch(); ++b) {
 422           for (unsigned int c = 0; c < channel(); ++c) {
 423             for (unsigned int h = 0; h < height(); ++h) {
 424               __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
 425               const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
 426               const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
 427               std::transform(in_data, in_data + width(), m_data, out_data,
 428                              std::multiplies<__fp16>());
 429             }
 430           }
 431         }
 432       }
 433 #else
 434       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 435 #endif
 436     }
 437   } else { // Format NHWC Case
 438     if (getDataType() == Tdatatype::FP32) {
 439       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 440           beta != 0.0) {
 441         for (unsigned int b = 0; b < batch(); ++b) {
 442           for (unsigned int h = 0; h < height(); ++h) {
 443             for (unsigned int w = 0; w < width(); ++w) {
 444               for (unsigned int c = 0; c < channel(); ++c) {
 445                 output.addValue(b, c, h, w,
 446                                 getValue<float>(b, c, h, w) *
 447                                   m.getValue<float>(b, c, h, w),
 448                                 beta);
 449               }
 450             }
 451           }
 452         }
 453       } else {
 454         /** @todo optimize this with combining these loops where
 455          * stride is 1 */
 456         for (unsigned int b = 0; b < batch(); ++b) {
 457           for (unsigned int h = 0; h < height(); ++h) {
 458             for (unsigned int w = 0; w < width(); ++w) {
 459               float *out_data = output.getAddress<float>(b, 0, h, w);
 460               const float *m_data = m.getAddress<float>(b, 0, h, w);
 461               const float *in_data = getAddress<float>(b, 0, h, w);
 462               std::transform(in_data, in_data + channel(), m_data, out_data,
 463                              std::multiplies<float>());
 464             }
 465           }
 466         }
 467       }
 468     } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
 469 #ifdef ENABLE_FP16
 470       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 471           beta != 0.0) {
 472         for (unsigned int b = 0; b < batch(); ++b) {
 473           for (unsigned int h = 0; h < height(); ++h) {
 474             for (unsigned int w = 0; w < width(); ++w) {
 475               for (unsigned int c = 0; c < channel(); ++c) {
 476                 output.addValue(b, c, h, w,
 477                                 getValue<__fp16>(b, c, h, w) *
 478                                   m.getValue<__fp16>(b, c, h, w),
 479                                 beta);
 480               }
 481             }
 482           }
 483         }
 484       } else {
 485         /** @todo optimize this with combining these loops where
 486          * stride is 1 */
 487         for (unsigned int b = 0; b < batch(); ++b) {
 488           for (unsigned int h = 0; h < height(); ++h) {
 489             for (unsigned int w = 0; w < width(); ++w) {
 490               __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
 491               const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
 492               const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
 493               std::transform(in_data, in_data + channel(), m_data, out_data,
 494                              std::multiplies<__fp16>());
 495             }
 496           }
 497         }
 498       }
 499 #else
 500       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 501 #endif
 502     }
 503   }
 504
 505   return output;
 506 }
 507
 508 int Tensor::add_i_strided(Tensor const &m, const float beta) {
 509   try {
 510     this->add_strided(m, *this, beta);
 511   } catch (std::exception &err) {
 512     ml_loge("%s %s", typeid(err).name(), err.what());
 513     return ML_ERROR_INVALID_PARAMETER;
 514   }
 515
 516   return ML_ERROR_NONE;
 517 }
 518
 519 Tensor Tensor::add_strided(Tensor const &m, const float beta) const {
 520   Tensor t;
 521   return this->add_strided(m, t, beta);
 522 }
 523
 524 Tensor &Tensor::add_strided(Tensor const &m, Tensor &output,
 525                             const float beta) const {
 526   /** TODO: throw than create new dimenions */
 527   CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
 528
 529   if (size() != m.size() || size() != output.size())
 530     throw std::invalid_argument(
 531       "Strided addition does not support broadcasting");
 532
 533   if (getDataType() == Tdatatype::FP32) {
 534     NNTR_THROW_IF(getData<float>() == nullptr, std::invalid_argument)
 535       << getName() << " is not allocated";
 536     NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
 537       << m.getName() << " is not allocated";
 538     NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
 539       << output.getName() << " is not allocated";
 540   } else if (getDataType() == Tdatatype::FP16) {
 541 #ifdef ENABLE_FP16
 542     NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
 543       << getName() << " is not allocated";
 544     NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
 545       << m.getName() << " is not allocated";
 546     NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
 547       << output.getName() << " is not allocated";
 548 #else
 549     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 550 #endif
 551   }
 552
 553   // Format NCHW Case
 554   if (this->getFormat() == Tformat::NCHW) {
 555     if (getDataType() == Tdatatype::FP32) {
 556       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 557           beta != 0.0) {
 558         for (unsigned int b = 0; b < batch(); ++b) {
 559           for (unsigned int c = 0; c < channel(); ++c) {
 560             for (unsigned int h = 0; h < height(); ++h) {
 561               for (unsigned int w = 0; w < width(); ++w) {
 562                 output.setValue(b, c, h, w,
 563                                 getValue<float>(b, c, h, w) +
 564                                   m.getValue<float>(b, c, h, w) * beta);
 565               }
 566             }
 567           }
 568         }
 569       } else {
 570         /** @todo optimize this with combining these loops where stride is 1 */
 571         for (unsigned int b = 0; b < batch(); ++b) {
 572           for (unsigned int c = 0; c < channel(); ++c) {
 573             for (unsigned int h = 0; h < height(); ++h) {
 574               float *out_data = output.getAddress<float>(b, c, h, 0);
 575               const float *m_data = m.getAddress<float>(b, c, h, 0);
 576               const float *in_data = getAddress<float>(b, c, h, 0);
 577               std::transform(in_data, in_data + width(), m_data, out_data,
 578                              std::plus<float>());
 579             }
 580           }
 581         }
 582       }
 583     } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 584 #ifdef ENABLE_FP16
 585       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 586           beta != 0.0) {
 587         for (unsigned int b = 0; b < batch(); ++b) {
 588           for (unsigned int c = 0; c < channel(); ++c) {
 589             for (unsigned int h = 0; h < height(); ++h) {
 590               for (unsigned int w = 0; w < width(); ++w) {
 591                 output.setValue(b, c, h, w,
 592                                 getValue<__fp16>(b, c, h, w) +
 593                                   m.getValue<__fp16>(b, c, h, w) * beta);
 594               }
 595             }
 596           }
 597         }
 598       } else {
 599         for (unsigned int b = 0; b < batch(); ++b) {
 600           for (unsigned int c = 0; c < channel(); ++c) {
 601             for (unsigned int h = 0; h < height(); ++h) {
 602               __fp16 *out_data = output.getAddress<__fp16>(b, c, h, 0);
 603               const __fp16 *m_data = m.getAddress<__fp16>(b, c, h, 0);
 604               const __fp16 *in_data = getAddress<__fp16>(b, c, h, 0);
 605               std::transform(in_data, in_data + width(), m_data, out_data,
 606                              std::plus<__fp16>());
 607             }
 608           }
 609         }
 610       }
 611 #else
 612       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 613 #endif
 614     }
 615   } else { // Format NHWC Case
 616     if (getDataType() == Tdatatype::FP32) {
 617       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 618           beta != 0.0) {
 619         for (unsigned int b = 0; b < batch(); ++b) {
 620           for (unsigned int h = 0; h < height(); ++h) {
 621             for (unsigned int w = 0; w < width(); ++w) {
 622               for (unsigned int c = 0; c < channel(); ++c) {
 623                 output.setValue(b, c, h, w,
 624                                 getValue<float>(b, c, h, w) +
 625                                   m.getValue<float>(b, c, h, w) * beta);
 626               }
 627             }
 628           }
 629         }
 630       } else {
 631         /** @todo optimize this with combining these loops where
 632          * stride is 1 */
 633         for (unsigned int b = 0; b < batch(); ++b) {
 634           for (unsigned int h = 0; h < height(); ++h) {
 635             for (unsigned int w = 0; w < width(); ++w) {
 636               float *out_data = output.getAddress<float>(b, 0, h, w);
 637               const float *m_data = m.getAddress<float>(b, 0, h, w);
 638               const float *in_data = getAddress<float>(b, 0, h, w);
 639               std::transform(in_data, in_data + channel(), m_data, out_data,
 640                              std::plus<float>());
 641             }
 642           }
 643         }
 644       }
 645     } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
 646 #ifdef ENABLE_FP16
 647       if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 648           beta != 0.0) {
 649         for (unsigned int b = 0; b < batch(); ++b) {
 650           for (unsigned int h = 0; h < height(); ++h) {
 651             for (unsigned int w = 0; w < width(); ++w) {
 652               for (unsigned int c = 0; c < channel(); ++c) {
 653                 output.setValue(b, c, h, w,
 654                                 getValue<__fp16>(b, c, h, w) +
 655                                   m.getValue<__fp16>(b, c, h, w) * beta);
 656               }
 657             }
 658           }
 659         }
 660       } else {
 661         /** @todo optimize this with combining these loops where
 662          * stride is 1 */
 663         for (unsigned int b = 0; b < batch(); ++b) {
 664           for (unsigned int h = 0; h < height(); ++h) {
 665             for (unsigned int w = 0; w < width(); ++w) {
 666               __fp16 *out_data = output.getAddress<__fp16>(b, 0, h, w);
 667               const __fp16 *m_data = m.getAddress<__fp16>(b, 0, h, w);
 668               const __fp16 *in_data = getAddress<__fp16>(b, 0, h, w);
 669               std::transform(in_data, in_data + channel(), m_data, out_data,
 670                              std::plus<__fp16>());
 671             }
 672           }
 673         }
 674       }
 675 #else
 676       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 677 #endif
 678     }
 679   }
 680   return output;
 681 }
 682
 683 int Tensor::multiply_i(float const &value) {
 684   NNTR_THROW_IF(!contiguous, std::invalid_argument)
 685     << getName() << " is not contiguous, cannot multiply";
 686
 687   /// @note this is not depending on multiply_i as there is an optimized
 688   /// version for multiply_i
 689   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 690     float *data = getData<float>();
 691     unsigned int len = size();
 692
 693     sscal(len, value, data, 1);
 694   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 695 #ifdef ENABLE_FP16
 696     __fp16 *data = getData<__fp16>();
 697     unsigned int len = size();
 698     sscal(len, value, data, 1);
 699 #else
 700     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 701 #endif
 702   }
 703   return ML_ERROR_NONE;
 704 }
 705
 706 Tensor Tensor::multiply(float const &value) const {
 707   Tensor t;
 708   return multiply(value, t);
 709 }
 710
 711 Tensor &Tensor::multiply(float const &value, Tensor &out) const {
 712   /// @todo add unittest
 713   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 714     auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
 715     return apply(f, out);
 716   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 717 #ifdef ENABLE_FP16
 718     auto f = std::bind(std::multiplies<__fp16>(), std::placeholders::_1, value);
 719     return apply(f, out);
 720 #else
 721     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 722 #endif
 723   }
 724   return out;
 725 }
 726
 727 int Tensor::multiply_i(Tensor const &m, const float beta) {
 728   try {
 729     this->multiply(m, *this, beta);
 730   } catch (std::exception &err) {
 731     ml_loge("%s %s", typeid(err).name(), err.what());
 732     return ML_ERROR_INVALID_PARAMETER;
 733   }
 734
 735   return ML_ERROR_NONE;
 736 }
 737
 738 Tensor Tensor::multiply(Tensor const &m, const float beta) const {
 739   Tensor t("", this->getFormat());
 740   return this->multiply(m, t, beta);
 741 }
 742
 743 Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
 744                          const float beta) const {
 745   /**
 746    * @note this does not work correctly with differently strided inputs.
 747    * Use multiply_strided alternatively
 748    */
 749   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 750     auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 751                  float *out_buf) {
 752       if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
 753           beta == 0.0) {
 754         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 755                        std::multiplies<float>());
 756       } else {
 757         for (unsigned int i = 0; i < e.buffer_size; ++i) {
 758           *out_buf = *buf * *m_buf + beta * *out_buf;
 759           buf += strides[3];
 760           m_buf += e.strides[3];
 761           out_buf += output.strides[3];
 762         }
 763       }
 764     };
 765
 766     NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
 767       << "Tensor Format of " << getName() << ":"
 768       << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
 769       << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
 770
 771     NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 772                   std::invalid_argument)
 773       << getName() << " is not contiguous, cannot multiply";
 774
 775     NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 776                   std::invalid_argument)
 777       << getName() << " is not contiguous, cannot multiply";
 778
 779     apply_broadcast(m, f, output);
 780     return output;
 781
 782   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 783 #ifdef ENABLE_FP16
 784     auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
 785                  __fp16 *out_buf) {
 786       if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
 787           beta == 0.0) {
 788         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 789                        std::multiplies<__fp16>());
 790       } else {
 791         for (unsigned int i = 0; i < e.buffer_size; ++i) {
 792           *out_buf = *buf * *m_buf + beta * *out_buf;
 793           buf += strides[3];
 794           m_buf += e.strides[3];
 795           out_buf += output.strides[3];
 796         }
 797       }
 798     };
 799
 800     NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
 801       << "Tensor Format of " << getName() << ":"
 802       << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
 803       << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
 804
 805     NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 806                   std::invalid_argument)
 807       << getName() << " is not contiguous, cannot multiply";
 808
 809     apply_broadcast(m, f, output);
 810     return output;
 811 #else
 812     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 813 #endif
 814   }
 815   return output;
 816 }
 817
 818 int Tensor::divide_i(float const &value) {
 819   if (value == 0.0f) {
 820     return ML_ERROR_INVALID_PARAMETER;
 821   }
 822   this->divide(value, *this);
 823   return ML_ERROR_NONE;
 824 }
 825
 826 Tensor Tensor::divide(float const &value) const {
 827   Tensor t;
 828   return divide(value, t);
 829 }
 830
 831 Tensor &Tensor::divide(float const &value, Tensor &out) const {
 832   auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
 833   /// @todo add unittest, __fp16 ZeroDivisionError
 834   if (value == 0.0f) {
 835     std::stringstream ss;
 836     ss << "[Tensor] divide by value failed, value: " << value;
 837     throw std::invalid_argument(ss.str().c_str());
 838   }
 839   return apply(f, out);
 840 }
 841
 842 int Tensor::divide_i(Tensor const &m) {
 843   try {
 844     this->divide(m, *this);
 845   } catch (std::exception &err) {
 846     ml_loge("%s %s", typeid(err).name(), err.what());
 847     return ML_ERROR_INVALID_PARAMETER;
 848   }
 849
 850   return ML_ERROR_NONE;
 851 }
 852
 853 Tensor Tensor::divide(Tensor const &m) const {
 854   Tensor t;
 855   return this->divide(m, t);
 856 }
 857
 858 Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
 859   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
 860     auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 861                  float *out_buf) {
 862       if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
 863         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 864                        std::divides<float>());
 865       } else {
 866         for (unsigned int i = 0; i < e.buffer_size; ++i) {
 867           *out_buf = *buf / *m_buf;
 868           buf += strides[3];
 869           m_buf += e.strides[3];
 870           out_buf += output.strides[3];
 871         }
 872       }
 873     };
 874
 875     NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 876                   std::invalid_argument)
 877       << getName() << " is not contiguous, cannot divide";
 878
 879     apply_broadcast(m, f, output);
 880   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
 881 #ifdef ENABLE_FP16
 882     auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
 883                  __fp16 *out_buf) {
 884       if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
 885         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 886                        std::divides<__fp16>());
 887       } else {
 888         for (unsigned int i = 0; i < e.buffer_size; ++i) {
 889           *out_buf = *buf / *m_buf;
 890           buf += strides[3];
 891           m_buf += e.strides[3];
 892           out_buf += output.strides[3];
 893         }
 894       }
 895     };
 896
 897     NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 898                   std::invalid_argument)
 899       << getName() << " is not contiguous, cannot divide";
 900
 901     apply_broadcast(m, f, output);
 902 #else
 903     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 904 #endif
 905   }
 906   return output;
 907 }
 908
 909 int Tensor::add_i(float const &value) {
 910   this->add(value, *this);
 911   return ML_ERROR_NONE;
 912 }
 913
 914 Tensor Tensor::add(float const &value) const {
 915   Tensor t;
 916   return add(value, t);
 917 }
 918
 919 Tensor &Tensor::add(float const &value, Tensor &out) const {
 920   /// @todo add unittest
 921   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 922     auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
 923     return apply(f, out);
 924   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 925 #ifdef ENABLE_FP16
 926     auto f = std::bind(std::plus<__fp16>(), std::placeholders::_1, value);
 927     return apply(f, out);
 928 #else
 929     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 930 #endif
 931   }
 932   return out;
 933 }
 934
 935 int Tensor::add_i(Tensor const &m, float const alpha) {
 936   /// @todo: add axis rather doing add over the last two dimensions always
 937   /// operator i has optimized version
 938   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 939     auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 940                  float *out_buf) {
 941       saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
 942     };
 943
 944     /// @todo: enable this after add_strided supports broadcast
 945     // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
 946     //   << getName() << " is not contiguous, cannot add";
 947
 948     try {
 949       apply_broadcast(m, f, *this);
 950     } catch (std::exception &err) {
 951       ml_loge("%s %s", typeid(err).name(), err.what());
 952       return ML_ERROR_INVALID_PARAMETER;
 953     }
 954
 955   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
 956 #ifdef ENABLE_FP16
 957     auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
 958                  __fp16 *out_buf) {
 959       saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
 960       /// @todo: saxpy is not valid for __fp16
 961     };
 962
 963     /// @todo: enable this after add_strided supports broadcast
 964     // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
 965     //   << getName() << " is not contiguous, cannot add";
 966
 967     try {
 968       apply_broadcast(m, f, *this);
 969     } catch (std::exception &err) {
 970       ml_loge("%s %s", typeid(err).name(), err.what());
 971       return ML_ERROR_INVALID_PARAMETER;
 972     }
 973
 974 #else
 975     ml_loge("%s", "Error: enable-fp16 is not enabled");
 976     return ML_ERROR_INVALID_PARAMETER;
 977 #endif
 978   }
 979   return ML_ERROR_NONE;
 980 }
 981
 982 Tensor Tensor::add(Tensor const &m, float const alpha) const {
 983   Tensor t;
 984   return this->add(m, t, alpha);
 985 }
 986
 987 Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
 988   NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 989                 std::invalid_argument)
 990     << getName() << " is not contiguous, cannot add";
 991
 992   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
 993     auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 994                  float *out_buf) {
 995       if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
 996           alpha == 0) {
 997         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 998                        std::plus<float>());
 999       } else {
1000         for (unsigned int i = 0; i < e.buffer_size; ++i) {
1001           *out_buf = *buf + *m_buf * alpha;
1002           buf += strides[3];
1003           m_buf += e.strides[3];
1004           out_buf += strides[3];
1005         }
1006       }
1007     };
1008     apply_broadcast(m, f, output);
1009   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1010 #ifdef ENABLE_FP16
1011     auto f = [&](const BroadcastInfo &e, const __fp16 *buf, const __fp16 *m_buf,
1012                  __fp16 *out_buf) {
1013       if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
1014           alpha == 0) {
1015         std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
1016                        std::plus<__fp16>());
1017       } else {
1018         for (unsigned int i = 0; i < e.buffer_size; ++i) {
1019           *out_buf = *buf + *m_buf * alpha;
1020           buf += strides[3];
1021           m_buf += e.strides[3];
1022           out_buf += strides[3];
1023         }
1024       }
1025     };
1026     apply_broadcast(m, f, output);
1027 #else
1028     throw std::invalid_argument("Error: enable-fp16 is not enabled");
1029 #endif
1030   }
1031   return output;
1032 }
1033
1034 int Tensor::subtract_i(float const &value) {
1035   this->subtract(value, *this);
1036   return ML_ERROR_NONE;
1037 }
1038
1039 Tensor Tensor::subtract(float const &value) const {
1040   Tensor t;
1041   return subtract(value, t);
1042 }
1043
1044 Tensor &Tensor::subtract(float const &value, Tensor &out) const {
1045   /// @todo add unittest
1046   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1047     auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
1048     return apply(f, out);
1049   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1050 #ifdef ENABLE_FP16
1051     auto f = std::bind(std::minus<__fp16>(), std::placeholders::_1, value);
1052     return apply(f, out);
1053 #else
1054     ml_loge("%s", "Error: enable-fp16 is not enabled");
1055 #endif
1056   }
1057   return out; // shouldn't reach
1058 }
1059
1060 int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
1061
1062 Tensor Tensor::subtract(Tensor const &m) const { return add(m, -1); }
1063
1064 Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const {
1065   return add(m, out, -1);
1066 }
1067
1068 int Tensor::pow_i(float exponent) {
1069   pow(exponent, *this);
1070   return ML_ERROR_NONE;
1071 }
1072
1073 Tensor Tensor::pow(float exponent) const {
1074   Tensor t;
1075   return pow(exponent, t);
1076 }
1077
1078 Tensor &Tensor::pow(float exponent, Tensor &out) const {
1079   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1080     auto f = [exponent](float in) { return powf(in, exponent); };
1081     return apply(f, out);
1082   }
1083   if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1084 #ifdef ENABLE_FP16
1085     auto f = [exponent](__fp16 in) { return powf(in, exponent); };
1086     return apply(f, out);
1087 #else
1088     ml_loge("%s", "Error: enable-fp16 is not enabled");
1089 #endif
1090   }
1091   return out;
1092 }
1093
1094 Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
1095   TensorDim dim_ = dim;
1096   dim_.batch(size);
1097
1098   return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen());
1099 }
1100
1101 void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
1102                                     size_t offset) {
1103   /**
1104    * - If src already has data allocaed, then directly make dest tensor based on
1105    * the src tensor.
1106    * - If src.data does not exist (meaning tensor does not memory allocated),
1107    * and src.src_tensor does not exist (meaning the src tensor does not depened
1108    * on another tensor), then create a SrcSharedTensor around the src.
1109    * - If src.src_tensor exists, then use the src.src_tensor to create the
1110    *  required SrcSharedTensor to avoid recursive dependency.
1111    *
1112    * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
1113    * if the batch size of src is updated and needs reallocation.
1114    */
1115   dest.data = nullptr;
1116   if (src.data) {
1117     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1118     dest.allocate();
1119   } else if (!src.src_tensor)
1120     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
1121   else
1122     dest.src_tensor = std::make_shared<SrcSharedTensor>(
1123       src.src_tensor->tensor(), offset + src.src_tensor->offset());
1124 }
1125
1126 Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset,
1127                                    bool reset_stride,
1128                                    const std::string &name_) const {
1129   Tensor ret = *this;
1130   if (dim_.getFormat() != ret.dim.getFormat())
1131     throw std::invalid_argument("Tensor format does not match");
1132
1133   ret.dim = dim_;
1134   if (!name_.empty())
1135     ret.name = name_;
1136
1137   if (dim_.getDataLen() + offset > dim.getDataLen())
1138     throw std::invalid_argument(
1139       "Creating shared tensor of size bigger than tensor memory.");
1140
1141   if (reset_stride)
1142     ret.strides = ret.dim.computeStrides();
1143
1144   TensorDim new_match_dim = dim_;
1145   new_match_dim.batch(dim.batch());
1146   if (new_match_dim != dim && !reset_stride)
1147     ret.contiguous = false;
1148
1149   /**
1150    * In this case, its the caller's responsibility to ensure that allocate() is
1151    * called for the output tensor before operating on the output tensor.
1152    */
1153   createSharedDataTensor(*this, ret, offset);
1154
1155   return ret;
1156 }
1157
1158 std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
1159   NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1160     << "num size cannot be zero";
1161
1162   if (axis == -1) {
1163     axis = 3;
1164   }
1165
1166   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1167     << "cannot split axis of axis: " << axis;
1168
1169   NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument)
1170     << "axis is not divisible by num_size, axis: " << axis
1171     << " num size: " << num_size;
1172
1173   std::vector<size_t> sizes;
1174   sizes.resize(num_size);
1175
1176   unsigned int sz = dim.getTensorDim(axis) / num_size;
1177   std::fill(sizes.begin(), sizes.end(), sz);
1178
1179   return split(sizes, axis);
1180 }
1181
1182 std::vector<Tensor> Tensor::split(std::vector<size_t> sizes, int axis) {
1183   size_t num_size = sizes.size();
1184
1185   NNTR_THROW_IF(num_size == 0, std::invalid_argument)
1186     << "num size cannot be zero";
1187
1188   if (axis == -1) {
1189     axis = 3;
1190   }
1191
1192   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1193     << "cannot split axis of axis: " << axis;
1194
1195   NNTR_THROW_IF(
1196     std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }),
1197     std::invalid_argument)
1198     << "among given sizes at least one of size is 0";
1199
1200   size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0);
1201   NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
1202     << "given sum of sizes did not match with origin tensor dim, tensor dim: "
1203     << dim.getTensorDim(axis) << " total size: " << total_size;
1204
1205   std::vector<TensorDim> ret_dims;
1206   ret_dims.reserve(num_size);
1207   for (unsigned int i = 0; i < num_size; ++i) {
1208     ret_dims[i] = dim;
1209     ret_dims[i].setTensorDim(axis, sizes[i]);
1210   }
1211
1212   bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
1213   std::vector<Tensor> ret;
1214
1215   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1216     auto iter_value = [this, is_format_nchw](
1217                         std::array<size_t, 4> &loc,
1218                         const std::array<size_t, 4> &end_loc,
1219                         const std::array<size_t, 4> &reset_dim_arr) -> float & {
1220       auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
1221                                      : getValue(loc[0], loc[3], loc[1], loc[2]);
1222       for (int i = 3; i >= 0; --i) {
1223         loc[i]++;
1224         if (loc[i] == end_loc[i]) {
1225           loc[i] -= reset_dim_arr[i];
1226           continue;
1227         }
1228         break;
1229       }
1230       return value;
1231     };
1232
1233     ret.reserve(num_size);
1234
1235     unsigned int accumulated_size = 0;
1236     for (unsigned int i = 0; i < num_size; ++i) {
1237       std::array<size_t, 4> loc = {0, 0, 0, 0};
1238
1239       if (is_format_nchw) {
1240         loc[axis] += accumulated_size;
1241       } else {
1242         if (axis == 0) {
1243           loc[0] += accumulated_size;
1244         } else if (axis == 1) {
1245           loc[3] += accumulated_size;
1246         } else if (axis == 2 || axis == 3) {
1247           loc[axis - 1] += accumulated_size;
1248         }
1249       }
1250
1251       ret.emplace_back(ret_dims[i]);
1252       auto &ret_t = ret.back();
1253
1254       std::array<size_t, 4> end_loc;
1255
1256       if (is_format_nchw) {
1257         end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1258                    ret_dims[i].height(), ret_dims[i].width()};
1259       } else {
1260         end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1261                    ret_dims[i].width(), ret_dims[i].channel()};
1262       }
1263
1264       accumulated_size += sizes[i];
1265
1266       if (is_format_nchw) {
1267         end_loc[axis] = accumulated_size;
1268       } else {
1269         if (axis == 0) {
1270           end_loc[0] = accumulated_size;
1271         } else if (axis == 1) {
1272           end_loc[3] = accumulated_size;
1273         } else if (axis == 2 || axis == 3) {
1274           end_loc[axis - 1] = accumulated_size;
1275         }
1276       }
1277
1278       std::array<size_t, 4> reset_dim_arr;
1279       if (is_format_nchw) {
1280         reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1281                          ret_dims[i].height(), ret_dims[i].width()};
1282       } else {
1283         reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1284                          ret_dims[i].width(), ret_dims[i].channel()};
1285       }
1286
1287       ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1288         return iter_value(loc, end_loc, reset_dim_arr);
1289       });
1290     }
1291   }
1292   if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1293 #ifdef ENABLE_FP16
1294     auto iter_value =
1295       [this, is_format_nchw](
1296         std::array<size_t, 4> &loc, const std::array<size_t, 4> &end_loc,
1297         const std::array<size_t, 4> &reset_dim_arr) -> __fp16 & {
1298       auto &value = (is_format_nchw)
1299                       ? getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1300                       : getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1301       for (int i = 3; i >= 0; --i) {
1302         loc[i]++;
1303         if (loc[i] == end_loc[i]) {
1304           loc[i] -= reset_dim_arr[i];
1305           continue;
1306         }
1307         break;
1308       }
1309       return value;
1310     };
1311
1312     ret.reserve(num_size);
1313
1314     unsigned int accumulated_size = 0;
1315     for (unsigned int i = 0; i < num_size; ++i) {
1316       std::array<size_t, 4> loc = {0, 0, 0, 0};
1317
1318       if (is_format_nchw) {
1319         loc[axis] += accumulated_size;
1320       } else {
1321         if (axis == 0) {
1322           loc[0] += accumulated_size;
1323         } else if (axis == 1) {
1324           loc[3] += accumulated_size;
1325         } else if (axis == 2 || axis == 3) {
1326           loc[axis - 1] += accumulated_size;
1327         }
1328       }
1329
1330       ret.emplace_back(ret_dims[i]);
1331       auto &ret_t = ret.back();
1332
1333       std::array<size_t, 4> end_loc;
1334
1335       if (is_format_nchw) {
1336         end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1337                    ret_dims[i].height(), ret_dims[i].width()};
1338       } else {
1339         end_loc = {ret_dims[i].batch(), ret_dims[i].height(),
1340                    ret_dims[i].width(), ret_dims[i].channel()};
1341       }
1342
1343       accumulated_size += sizes[i];
1344
1345       if (is_format_nchw) {
1346         end_loc[axis] = accumulated_size;
1347       } else {
1348         if (axis == 0) {
1349           end_loc[0] = accumulated_size;
1350         } else if (axis == 1) {
1351           end_loc[3] = accumulated_size;
1352         } else if (axis == 2 || axis == 3) {
1353           end_loc[axis - 1] = accumulated_size;
1354         }
1355       }
1356
1357       std::array<size_t, 4> reset_dim_arr;
1358       if (is_format_nchw) {
1359         reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1360                          ret_dims[i].height(), ret_dims[i].width()};
1361       } else {
1362         reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1363                          ret_dims[i].width(), ret_dims[i].channel()};
1364       }
1365
1366       ret_t.apply_i([&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1367         return iter_value(loc, end_loc, reset_dim_arr);
1368       });
1369     }
1370
1371 #else
1372     throw std::invalid_argument("Error: enable-fp16 is not enabled");
1373 #endif
1374   }
1375
1376   return ret;
1377 }
1378
1379 Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
1380
1381   if (axis == -1) {
1382     axis = 3;
1383   }
1384
1385   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
1386     << "cannot split axis of axis: " << axis;
1387
1388   NNTR_THROW_IF(tensors.empty(), std::invalid_argument)
1389     << "given tensor vector is empty";
1390
1391   Tensor ret;
1392   auto ref_dim = tensors.front().getDim();
1393   bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW);
1394   ref_dim.setTensorDim(axis, 1);
1395   NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
1396                              [&ref_dim, axis](const Tensor &t) {
1397                                auto cur_dim = t.getDim();
1398                                cur_dim.setTensorDim(axis, 1);
1399                                return ref_dim == cur_dim;
1400                              }),
1401                 std::invalid_argument)
1402     << " all tensor must have the same dimension except for the axis, ref_dim: "
1403     << ref_dim << " axis : " << axis;
1404
1405   auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
1406                                   [axis](unsigned cur, const Tensor &t) {
1407                                     return cur += t.getDim().getTensorDim(axis);
1408                                   });
1409   if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
1410     auto iter_value =
1411       [is_format_nchw](std::array<unsigned, 4> &loc,
1412                        const std::array<unsigned, 4> &start_loc, Tensor &t,
1413                        const std::array<unsigned, 4> &ref_dim_arr) -> float & {
1414       auto &value = is_format_nchw
1415                       ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
1416                       : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
1417
1418       for (int i = 3; i >= 0; --i) {
1419         loc[i]++;
1420         if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1421           loc[i] = start_loc[i];
1422           continue;
1423         }
1424         break;
1425       }
1426       return value;
1427     };
1428
1429     auto ret_dim = ref_dim;
1430     ret_dim.setTensorDim(axis, axis_dim);
1431
1432     ret = Tensor(ret_dim);
1433
1434     std::array<unsigned, 4> loc = {0, 0, 0, 0};
1435     for (auto &t : tensors) {
1436       std::array<unsigned, 4> start_loc = loc;
1437       std::array<unsigned, 4> tensor_dim_arr;
1438       if (is_format_nchw) {
1439         tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1440         tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1441         tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1442         tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1443       } else {
1444         tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1445         tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1446         tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1447         tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1448       }
1449
1450       for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1451         iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<float>(i);
1452       }
1453
1454       if (is_format_nchw) {
1455         loc[axis] += t.getDim().getTensorDim(axis);
1456       } else {
1457         if (axis == 0) {
1458           loc[0] += t.getDim().getTensorDim(axis);
1459         } else if (axis == 1) {
1460           loc[3] += t.getDim().getTensorDim(axis);
1461         } else if (axis == 2 || axis == 3) {
1462           loc[axis - 1] += t.getDim().getTensorDim(axis);
1463         }
1464       }
1465     }
1466
1467     // return ret;
1468   } else if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
1469 #ifdef ENABLE_FP16
1470     auto iter_value =
1471       [is_format_nchw](std::array<unsigned, 4> &loc,
1472                        const std::array<unsigned, 4> &start_loc, Tensor &t,
1473                        const std::array<unsigned, 4> &ref_dim_arr) -> __fp16 & {
1474       auto &value = is_format_nchw
1475                       ? t.getValue<__fp16>(loc[0], loc[1], loc[2], loc[3])
1476                       : t.getValue<__fp16>(loc[0], loc[3], loc[1], loc[2]);
1477
1478       for (int i = 3; i >= 0; --i) {
1479         loc[i]++;
1480         if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1481           loc[i] = start_loc[i];
1482           continue;
1483         }
1484         break;
1485       }
1486       return value;
1487     };
1488
1489     auto ret_dim = ref_dim;
1490     ret_dim.setTensorDim(axis, axis_dim);
1491
1492     ret = Tensor(ret_dim);
1493
1494     std::array<unsigned, 4> loc = {0, 0, 0, 0};
1495     for (auto &t : tensors) {
1496       std::array<unsigned, 4> start_loc = loc;
1497       std::array<unsigned, 4> tensor_dim_arr;
1498       if (is_format_nchw) {
1499         tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1500         tensor_dim_arr[1] = t.getDim().getTensorDim(1);
1501         tensor_dim_arr[2] = t.getDim().getTensorDim(2);
1502         tensor_dim_arr[3] = t.getDim().getTensorDim(3);
1503       } else {
1504         tensor_dim_arr[0] = t.getDim().getTensorDim(0);
1505         tensor_dim_arr[1] = t.getDim().getTensorDim(2);
1506         tensor_dim_arr[2] = t.getDim().getTensorDim(3);
1507         tensor_dim_arr[3] = t.getDim().getTensorDim(1);
1508       }
1509
1510       for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1511         iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<__fp16>(i);
1512       }
1513
1514       if (is_format_nchw) {
1515         loc[axis] += t.getDim().getTensorDim(axis);
1516       } else {
1517         if (axis == 0) {
1518           loc[0] += t.getDim().getTensorDim(axis);
1519         } else if (axis == 1) {
1520           loc[3] += t.getDim().getTensorDim(axis);
1521         } else if (axis == 2 || axis == 3) {
1522           loc[axis - 1] += t.getDim().getTensorDim(axis);
1523         }
1524       }
1525     }
1526
1527 #else
1528     throw std::invalid_argument("Error: enable-fp16 is not enabled");
1529 #endif
1530   }
1531   return ret;
1532 }
1533
1534 void Tensor::makeSharedDataTensor(const Tensor &src, size_t offset) {
1535   if (strides != src.strides)
1536     throw std::invalid_argument(
1537       "Creating shared tensor of different stride than source tensor.");
1538
1539   if (getDim().getDataLen() + offset > src.getDim().getDataLen())
1540     throw std::invalid_argument(
1541       "Creating shared tensor of different size or stride than source tensor.");
1542
1543   /**
1544    * In this case, its the caller's responsibility to ensure that allocate() is
1545    * called for the output tensor before operating on the output tensor.
1546    */
1547   createSharedDataTensor(src, *this, offset);
1548 }
1549
1550 void Tensor::apply_broadcast(
1551   Tensor const &m,
1552   std::function<void(const BroadcastInfo &e, const float *, const float *,
1553                      float *)>
1554     v_func,
1555   Tensor &output) const {
1556   CREATE_IF_EMPTY_DIMS(output, dim);
1557
1558   NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
1559     << getName() << " is not allocated";
1560   NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument)
1561     << m.getName() << " is not allocated";
1562   NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument)
1563     << output.getName() << " is not allocated";
1564
1565   /// shortcut to cover when dimension matches
1566   /// note that buffer_size, the last stride is only used in v_func but it
1567   /// might be changed
1568   if (dim == m.dim) {
1569     BroadcastInfo e;
1570     e.buffer_size = size();
1571     e.strides[3] = 1;
1572     e.tensor_type = getTensorType();
1573     v_func(e, getData(), m.getData(), output.getData());
1574     return;
1575   }
1576
1577   return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1578 }
1579
1580 #ifdef ENABLE_FP16
1581 void Tensor::apply_broadcast(
1582   Tensor const &m,
1583   std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1584                      __fp16 *)>
1585     v_func,
1586   Tensor &output) const {
1587   CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
1588
1589   NNTR_THROW_IF(getData<__fp16>() == nullptr, std::invalid_argument)
1590     << getName() << " is not allocated";
1591   NNTR_THROW_IF(m.getData<__fp16>() == nullptr, std::invalid_argument)
1592     << m.getName() << " is not allocated";
1593   NNTR_THROW_IF(output.getData<__fp16>() == nullptr, std::invalid_argument)
1594     << output.getName() << " is not allocated";
1595
1596   /// shortcut to cover when dimension matches
1597   /// note that buffer_size, the last stride is only used in v_func but it
1598   /// might be changed
1599   if (dim == m.dim) {
1600     BroadcastInfo e;
1601     e.buffer_size = size();
1602     e.strides[3] = 1;
1603     v_func(e, getData<__fp16>(), m.getData<__fp16>(), output.getData<__fp16>());
1604     return;
1605   }
1606
1607   return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1608 }
1609
1610 void Tensor::apply_broadcast_util(
1611   Tensor const &m,
1612   std::function<void(const BroadcastInfo &e, const __fp16 *, const __fp16 *,
1613                      __fp16 *)>
1614     v_func,
1615   Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1616   size_t m_offset) const {
1617
1618   const __fp16 *buf = this->getData<__fp16>();
1619   const __fp16 *m_buf = m.getData<__fp16>();
1620   __fp16 *out_buf = output.getData<__fp16>();
1621
1622   if (e.buffer_axis == cur_axis) {
1623     v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1624     return;
1625   }
1626
1627   cur_axis++;
1628   for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) {
1629     size_t next_offset = offset + i * strides[cur_axis];
1630     size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1631     apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1632                          next_m_offset);
1633   }
1634 }
1635
1636 #endif
1637
1638 void Tensor::apply_broadcast_util(
1639   Tensor const &m,
1640   std::function<void(const BroadcastInfo &e, const float *, const float *,
1641                      float *)>
1642     v_func,
1643   Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1644   size_t m_offset) const {
1645
1646   const float *buf = this->getData();
1647   const float *m_buf = m.getData();
1648   float *out_buf = output.getData();
1649
1650   if (e.buffer_axis == cur_axis) {
1651     v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1652     return;
1653   }
1654
1655   cur_axis++;
1656   uint continuity[4] = {0, 1, 2, 3};
1657   if (getFormat() == Tformat::NHWC) {
1658     continuity[1] = 2;
1659     continuity[2] = 3;
1660     continuity[3] = 1;
1661   }
1662   for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
1663     size_t next_offset = offset + i * strides[cur_axis];
1664     size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1665     apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1666                          next_m_offset);
1667   }
1668 }
1669
1670 /**
1671  * This is to sum the Tensor data according to the dim.batch().
1672  * Therefore the result has M(dim.batch(), 1, 1, 1) dimension.
1673  */
1674 Tensor Tensor::sum_by_batch() const {
1675   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1676     << getName() << " is not contiguous, cannot sum";
1677
1678   Tensor ret(dim.batch(), 1, 1, 1, this->getFormat(), getDataType());
1679   size_t feat_len = dim.getFeatureLen();
1680   size_t batch = dim.batch();
1681
1682   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1683     const float *data = getData();
1684     float *rdata = ret.getData();
1685
1686     Tensor ones(1, 1, 1, feat_len, this->getFormat());
1687     ones.setValue(1.0);
1688     sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1689           ones.getData<float>(), 1, 0.0, rdata, 1);
1690   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1691 #ifdef ENABLE_FP16
1692     const __fp16 *data = getData<__fp16>();
1693     __fp16 *rdata = ret.getData<__fp16>();
1694
1695     Tensor ones(1, 1, 1, feat_len, this->getTensorType());
1696     ones.setValue((__fp16)1.0);
1697     sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
1698           ones.getData<__fp16>(), 1, 0.0, rdata, 1);
1699 #else
1700     throw std::invalid_argument("Error: enable-fp16 is not enabled");
1701 #endif
1702   }
1703
1704   return ret;
1705 }
1706
1707 /**
1708  * @brief Calculate sum according to the axis.
1709  */
1710 Tensor Tensor::sum(unsigned int axis, float alpha) const {
1711   Tensor ret("", this->getFormat(), this->getDataType());
1712   return sum(axis, ret, alpha, 0);
1713 }
1714
1715 Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
1716                     float beta) const {
1717
1718   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
1719     const float *data = getData<float>();
1720
1721     NNTR_THROW_IF(!contiguous, std::invalid_argument)
1722       << getName() << " is not contiguous, cannot sum";
1723
1724     if (axis >= 4)
1725       throw std::out_of_range("Error: axis is invalid");
1726
1727     if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1728       CREATE_IF_EMPTY_DIMS(ret, dim);
1729       ret.copy(this->getData());
1730       return ret;
1731     }
1732
1733     switch (axis) {
1734     case 0: {
1735       CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1736                            this->getTensorType());
1737       size_t feat_len = dim.getFeatureLen();
1738       size_t batch = dim.batch();
1739       Tensor ones(1, 1, 1, batch, this->getFormat());
1740       ones.setValue(alpha);
1741       sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1742             ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1743     } break;
1744     case 1: {
1745       CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1746       if (this->getFormat() == Tformat::NHWC) {
1747         unsigned int m = ret.dim.getDataLen();
1748         unsigned int n = dim[1];
1749         Tensor ones(1, 1, 1, n, this->getTensorType());
1750         ones.setValue(alpha);
1751         sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1752               ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1753       } else {
1754         unsigned int feat_len = dim[2] * dim[3];
1755         unsigned int t_axis = dim[1];
1756         Tensor ones(1, 1, 1, t_axis, getTensorType());
1757         ones.setValue(alpha);
1758         float *rdata = ret.getData<float>();
1759         for (unsigned int k = 0; k < dim[0]; ++k) {
1760           sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1761                 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1762                 1, beta, &rdata[k * feat_len], 1);
1763         }
1764       }
1765     } break;
1766     case 2: {
1767       CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1768
1769       if (this->getFormat() == Tformat::NHWC) {
1770         unsigned int feat_len = dim[1] * dim[3];
1771         unsigned int t_axis = dim[2];
1772         Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1773         ones.setValue(alpha);
1774         float *rdata = ret.getData<float>();
1775         for (unsigned int k = 0; k < dim[0]; ++k) {
1776           sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1777                 &data[k * dim.getFeatureLen()], feat_len, ones.getData<float>(),
1778                 1, beta, &rdata[k * feat_len], 1);
1779         }
1780       } else {
1781         unsigned int t_3 = dim[3];
1782         unsigned int t_axis = dim[2];
1783         Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1784         ones.setValue(alpha);
1785         float *rdata = ret.getData<float>();
1786         for (unsigned int k = 0; k < dim[0]; ++k) {
1787           for (unsigned int c = 0; c < dim[1]; ++c) {
1788             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1789             unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1790             sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1791                   ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1792           }
1793         }
1794       }
1795     } break;
1796     case 3: {
1797       CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1,
1798                            this->getTensorType());
1799       if (this->getFormat() == Tformat::NHWC) {
1800         unsigned int t_3 = dim[1];
1801         unsigned int t_axis = dim[3];
1802         Tensor ones(1, 1, 1, t_axis, this->getTensorType());
1803         ones.setValue(alpha);
1804         float *rdata = ret.getData<float>();
1805         for (unsigned int k = 0; k < dim[0]; ++k) {
1806           for (unsigned int c = 0; c < dim[2]; ++c) {
1807             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1808             unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1809             sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1810                   ones.getData<float>(), 1, beta, &rdata[ridx], 1);
1811           }
1812         }
1813       } else {
1814         unsigned int m = ret.dim.getDataLen();
1815         unsigned int n = dim[3];
1816         Tensor ones(1, 1, 1, n);
1817         ones.setValue(alpha);
1818         sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1819               ones.getData<float>(), 1, beta, ret.getData<float>(), 1);
1820       }
1821     } break;
1822     default:
1823       throw std::out_of_range("Error: Dimension cannot exceed 3");
1824     }
1825   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
1826 #ifdef ENABLE_FP16
1827     const __fp16 *data = getData<__fp16>();
1828
1829     NNTR_THROW_IF(!contiguous, std::invalid_argument)
1830       << getName() << " is not contiguous, cannot sum";
1831
1832     if (axis >= 4)
1833       throw std::out_of_range("Error: axis is invalid");
1834
1835     if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
1836       CREATE_IF_EMPTY_DIMS(ret, dim);
1837       ret.copy(this->getData<__fp16>());
1838       return ret;
1839     }
1840
1841     switch (axis) {
1842     case 0: {
1843       CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(),
1844                            this->getTensorType());
1845       size_t feat_len = dim.getFeatureLen();
1846       size_t batch = dim.batch();
1847       Tensor ones(1, 1, 1, batch, this->getTensorType());
1848       ones.setValue(alpha);
1849       sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
1850             ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1851     } break;
1852     case 1: {
1853       CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType());
1854       if (this->getFormat() == Tformat::NHWC) {
1855         unsigned int m = ret.dim.getDataLen();
1856         unsigned int n = dim[1];
1857         Tensor ones(1, 1, 1, n, this->getTensorType());
1858         ones.setValue(alpha);
1859         sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1860               ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1861       } else {
1862         unsigned int feat_len = dim[2] * dim[3];
1863         unsigned int t_axis = dim[1];
1864         Tensor ones(1, 1, 1, t_axis, getTensorType());
1865         ones.setValue(alpha);
1866         __fp16 *rdata = ret.getData<__fp16>();
1867         for (unsigned int k = 0; k < dim[0]; ++k) {
1868           sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1869                 &data[k * dim.getFeatureLen()], feat_len,
1870                 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1871         }
1872       }
1873     } break;
1874     case 2: {
1875       CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType());
1876
1877       if (this->getFormat() == Tformat::NHWC) {
1878         unsigned int feat_len = dim[1] * dim[3];
1879         unsigned int t_axis = dim[2];
1880         Tensor ones(1, 1, 1, t_axis, getTensorType());
1881         ones.setValue(alpha);
1882         __fp16 *rdata = ret.getData<__fp16>();
1883         for (unsigned int k = 0; k < dim[0]; ++k) {
1884           sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1,
1885                 &data[k * dim.getFeatureLen()], feat_len,
1886                 ones.getData<__fp16>(), 1, beta, &rdata[k * feat_len], 1);
1887         }
1888       } else {
1889         unsigned int t_3 = dim[3];
1890         unsigned int t_axis = dim[2];
1891         Tensor ones(1, 1, 1, t_axis, getTensorType());
1892         ones.setValue(alpha);
1893         __fp16 *rdata = ret.getData<__fp16>();
1894         for (unsigned int k = 0; k < dim[0]; ++k) {
1895           for (unsigned int c = 0; c < dim[1]; ++c) {
1896             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
1897             unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3];
1898             sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1899                   ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1900           }
1901         }
1902       }
1903     } break;
1904     case 3: {
1905       CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, getTensorType());
1906       if (this->getFormat() == Tformat::NHWC) {
1907         unsigned int t_3 = dim[1];
1908         unsigned int t_axis = dim[3];
1909         Tensor ones(1, 1, 1, t_axis, getTensorType());
1910         ones.setValue(alpha);
1911         __fp16 *rdata = ret.getData<__fp16>();
1912         for (unsigned int k = 0; k < dim[0]; ++k) {
1913           for (unsigned int c = 0; c < dim[2]; ++c) {
1914             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
1915             unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1];
1916             sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3,
1917                   ones.getData<__fp16>(), 1, beta, &rdata[ridx], 1);
1918           }
1919         }
1920       } else {
1921         unsigned int m = ret.dim.getDataLen();
1922         unsigned int n = dim[3];
1923         Tensor ones(1, 1, 1, n, getTensorType());
1924         ones.setValue(alpha);
1925         sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n,
1926               ones.getData<__fp16>(), 1, beta, ret.getData<__fp16>(), 1);
1927       }
1928     } break;
1929     default:
1930       throw std::out_of_range("Error: Dimension cannot exceed 3");
1931     }
1932 #else
1933     throw std::invalid_argument("Error: enable-fp16 is not enabled");
1934 #endif
1935   }
1936   return ret;
1937 }
1938
1939 Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
1940   Tensor ret("", this->getFormat());
1941   return sum(axes, ret, alpha);
1942 }
1943
1944 void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
1945   std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1946   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1947     << getName() << " is not contiguous, cannot merge axis";
1948
1949   if (axis2 != axis1 + 1)
1950     if (!checkContinuous(axis1, axis2))
1951       throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
1952
1953   dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2));
1954   dim.setTensorDim(axis1, 1);
1955 }
1956
1957 Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
1958                     float alpha) const {
1959   if (axes.empty())
1960     throw std::invalid_argument("empty axes given");
1961
1962   if (axes.size() == 1) {
1963     this->sum(axes[0], output, alpha);
1964   } else {
1965     /** club axes together */
1966     Tensor new_reshaped = *this;
1967     std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
1968     std::vector<unsigned int> new_axes = {axes[0]};
1969
1970     for (unsigned int i = 1; i < axes.size(); ++i) {
1971       if (checkContinuous(axes[i - 1], axes[i])) {
1972         new_reshaped.mergeAxis(axes[i - 1], axes[i]);
1973         new_axes.back() = axes[i];
1974       } else {
1975         new_axes.push_back(axes[i]);
1976       }
1977     }
1978
1979     Tensor ret = new_reshaped.sum(new_axes[0]);
1980     for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
1981       ret = ret.sum(axes[i]);
1982     ret.sum(new_axes.back(), output, alpha);
1983   }
1984
1985   return output;
1986 }
1987
1988 Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
1989                            bool trans_m, float beta) const {
1990   if (!result.isAllocated())
1991     throw std::invalid_argument(
1992       "Output tensor must be preallocated for dotBatched operation");
1993   for (unsigned int b = 0; b < batch(); b++) {
1994     /** @todo try using transpose to speedup the operation */
1995     const Tensor this_b = this->getBatchSlice(b, 1);
1996     Tensor m_b = m.getBatchSlice(b, 1);
1997     Tensor result_b = result.getBatchSlice(b, 1);
1998
1999     this_b.dot(m_b, result_b, trans, trans_m, beta);
2000   }
2001
2002   return result;
2003 }
2004
2005 Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const {
2006   Tensor output("", this->getFormat(), this->getDataType());
2007   dot(m, output, trans, trans_m);
2008
2009   return output;
2010 }
2011 /**
2012  * @brief compute the derivative of this in the current tensor
2013  * @todo will have to see if beta effects this computation
2014  */
2015 Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
2016                                 bool trans, bool trans_m, float beta) {
2017   bool deriv_trans_m = true;
2018   bool deriv_trans = false;
2019   /** @todo handle all cases of trans and trans_m */
2020   if (!trans && trans_m) {
2021     deriv_trans_m = false;
2022   }
2023
2024   return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
2025 }
2026
2027 /**
2028  * @brief compute the derivative wrt m in the m tensor
2029  * @note The caller tensor must be the same tensor as the one which called the
2030  * dot() product.
2031  */
2032 Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
2033                                 bool trans, bool trans_m, float beta) const {
2034   bool deriv_trans_m = false;
2035   bool deriv_trans = true;
2036   /** @todo handle all cases of trans and trans_m */
2037
2038   if (!trans && trans_m) {
2039     output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
2040     return m_deriv;
2041   } else {
2042     return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
2043   }
2044 }
2045
2046 Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
2047                                         Tensor const &output_deriv, bool trans,
2048                                         bool trans_m, float beta) {
2049   bool deriv_trans_m = true;
2050   bool deriv_trans = false;
2051   /** @todo handle all cases of trans and trans_m */
2052   if (!trans && trans_m) {
2053     deriv_trans_m = false;
2054   }
2055
2056   return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
2057 }
2058
2059 Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
2060                                         Tensor const &output_deriv, bool trans,
2061                                         bool trans_m, float beta) const {
2062   bool deriv_trans_m = false;
2063   bool deriv_trans = true;
2064   /** @todo handle all cases of trans and trans_m */
2065
2066   if (!trans && trans_m) {
2067     output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
2068     return m_deriv;
2069   } else {
2070     return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
2071   }
2072 }
2073
2074 /**
2075  * @note: This dot product flattens the fist 3 axis for the purpose of
2076  * computation. So, while performing, these matrices are behaving as 2-D
2077  * matrices. The dimensions are restored while returning back the tensor
2078  * in case of trans is false.
2079  */
2080 Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m,
2081                     float beta) const {
2082   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2083     << getName() << " is not contiguous. Cannot dot product.";
2084
2085   // Comment out with intension to support the calculation wrt. batch and height
2086   // direction. It supposes to have this->dim as [ BxCxH,W ] and m.dim is
2087   // [BxCxH,W] as well if (m.dim.rank() > 2) {
2088   //   throw exception::not_supported("Error: support only for rank of dot "
2089   //                                  "matrix <= 2");
2090   // }
2091
2092   // Comment out with intension to support the calculation wrt. batch and height
2093   // direction of this tensor. It is OK as long as m is 2D
2094   //
2095   if (trans && dim.rank() > 2) {
2096     ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
2097   }
2098   unsigned int dim1, dim2, mdim1, mdim2;
2099   if (getFormat() == Tformat::NHWC) {
2100     dim1 = batch() * height() * width();
2101     dim2 = channel();
2102     mdim1 = m.batch() * m.height() * m.width();
2103     mdim2 = m.channel();
2104   } else {
2105     dim1 = batch() * channel() * height();
2106     dim2 = width();
2107     mdim1 = m.batch() * m.channel() * m.height();
2108     mdim2 = m.width();
2109   }
2110
2111   unsigned int M, N, K, lda, ldb, ldc;
2112
2113   if (!trans && !trans_m) {
2114     if (dim2 != mdim1)
2115       throw std::runtime_error(
2116         "Error: incompatible dimensions for dot product");
2117     K = mdim1; /** == dim2 */
2118     N = mdim2;
2119     M = dim1;
2120     if (getFormat() == Tformat::NHWC) {
2121       CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2122                            getTensorType()); //  NHWC Result Tensor
2123     } else {
2124       CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2125                            getTensorType());
2126     }
2127
2128     // We are not set zero the result because of performance reason.
2129     // However, result is not initialized properly. There might include
2130     // garbage like nan. When we have to use this value as in C = alpha*A*B +
2131     // beta*C, then have to check garbage data of C is not effect or not.
2132
2133   } else if (!trans && trans_m) {
2134     if (dim2 != mdim2)
2135       throw std::runtime_error(
2136         "Error: incompatible dimensions for dot product");
2137     K = mdim2; /** == dim2 */
2138     N = mdim1;
2139     M = dim1;
2140     if (getFormat() == Tformat::NHWC) {
2141       CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(),
2142                            getTensorType());
2143     } else {
2144       CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2145                            getTensorType());
2146       CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2147                            getTensorType());
2148       CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N,
2149                            getTensorType());
2150     }
2151   } else if (trans && !trans_m) {
2152     if (dim1 != mdim1)
2153       throw std::runtime_error(
2154         "Error: incompatible dimensions for dot product");
2155     K = mdim1; /** == dim1 */
2156     N = mdim2;
2157     M = dim2;
2158     if (getFormat() == Tformat::NHWC) {
2159       CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2160     } else {
2161       CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2162     }
2163   } else {
2164     if (dim1 != mdim2)
2165       throw std::runtime_error(
2166         "Error: incompatible dimensions for dot product");
2167     K = mdim2; /** == dim1 */
2168     N = mdim1;
2169     M = dim2;
2170     if (getFormat() == Tformat::NHWC) {
2171       CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType());
2172     } else {
2173       CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType());
2174     }
2175   }
2176   lda = dim2;
2177   ldb = mdim2;
2178   ldc = (getFormat() == Tformat::NHWC) ? result.channel() : result.width();
2179
2180   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2181     const float *data = getData();
2182     const float *mdata = m.getData();
2183     float *rdata = result.getData();
2184     const float alpha = 1.0f;
2185     enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2186     enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2187
2188     /// shortcut handling in case of vector
2189     /// for vector, (1 * K) == (K * 1) in current memory layout...
2190     /// and plaese note that N, K, M is a fixed place holder after considering
2191     /// transpose.
2192     /// For example, there is no case like (1 * K) X (1 * K) while
2193     /// (1 * K) X (1 * M) can be a case
2194     /// case1: (1 * K) X (K * 1)
2195     if (M == 1 && N == 1) {
2196       *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2197     }
2198     /// case2: (M * K) X (K * 1)
2199     else if (N == 1) {
2200       sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2201             rdata, 1);
2202     }
2203     /// case3: (1 * K) X (K * N) = 1 * N = R
2204     /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2205     /// Effectively a translation of sgemv
2206     else if (M == 1) {
2207       transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2208       sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2209             beta, rdata, 1);
2210     }
2211     /// case others: use gemm
2212     else {
2213       sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2214             ldb, beta, rdata, ldc);
2215     }
2216   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2217 #ifdef ENABLE_FP16
2218     const __fp16 *data = getData<__fp16>();
2219     const __fp16 *mdata = m.getData<__fp16>();
2220     __fp16 *rdata = result.getData<__fp16>();
2221     const float alpha = 1.0f;
2222     enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
2223     enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
2224
2225     /// shortcut handling in case of vector
2226     /// for vector, (1 * K) == (K * 1) in current memory layout...
2227     /// and plaese note that N, K, M is a fixed place holder after considering
2228     /// transpose.
2229     /// For example, there is no case like (1 * K) X (1 * K) while
2230     /// (1 * K) X (1 * M) can be a case
2231     /// case1: (1 * K) X (K * 1)
2232     if (M == 1 && N == 1) {
2233       *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
2234     }
2235     /// case2: (M * K) X (K * 1)
2236     else if (N == 1) {
2237       sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
2238             rdata, 1);
2239     }
2240     /// case3: (1 * K) X (K * N) = 1 * N = R
2241     /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
2242     /// Effectively a translation of sgemv
2243     else if (M == 1) {
2244       transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
2245       sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1,
2246             beta, rdata, 1);
2247     }
2248     /// case others: use sgemm
2249     else {
2250       sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata,
2251             ldb, beta, rdata, ldc);
2252     }
2253 #else
2254     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2255 #endif
2256   }
2257
2258   return result;
2259 }
2260
2261 Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
2262   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2263     << getName() << " is not contiguous. Cannot transpose.";
2264
2265   if (out.getData() == getData()) {
2266     Tensor tmp = clone();
2267     return tmp.transpose(direction, out);
2268   }
2269
2270   unsigned int SL, SI, SJ, SK;
2271
2272   out.reshape(dim.transpose(direction));
2273
2274   int indexI = direction[0] - '0';
2275   int indexJ = direction[2] - '0';
2276
2277   SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
2278
2279   bool is_format_nchw = (getFormat() == Tformat::NCHW);
2280
2281   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2282     const float *inptr = getData();
2283     float *outptr = out.getData();
2284     switch (indexI) {
2285     case 0:
2286       if (indexJ == 1) {
2287         if (is_format_nchw) {
2288           transposeloop(l, i, j, k, SL, SI, SJ, SK);
2289         } else {
2290           transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2291         }
2292       } else {
2293         if (is_format_nchw) {
2294           transposeloop(l, i, k, j, SL, SI, SK, SJ);
2295         } else {
2296           transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2297         }
2298       }
2299       break;
2300     case 1:
2301       if (indexJ == 0) {
2302         if (is_format_nchw) {
2303           transposeloop(l, j, i, k, SL, SJ, SI, SK);
2304         } else {
2305           transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2306         }
2307       } else {
2308         if (is_format_nchw) {
2309           transposeloop(l, j, k, i, SL, SJ, SK, SI);
2310         } else {
2311           transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2312         }
2313       }
2314       break;
2315     case 2:
2316       if (indexJ == 0) {
2317         if (is_format_nchw) {
2318           transposeloop(l, k, i, j, SL, SK, SI, SJ);
2319         } else {
2320           transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2321         }
2322       } else {
2323         if (is_format_nchw) {
2324           transposeloop(l, k, j, i, SL, SK, SJ, SI);
2325         } else {
2326           transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2327         }
2328       }
2329       break;
2330     }
2331   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2332 #ifdef ENABLE_FP16
2333     const __fp16 *inptr = getData<__fp16>();
2334     __fp16 *outptr = out.getData<__fp16>();
2335     switch (indexI) {
2336     case 0:
2337       if (indexJ == 1) {
2338         if (is_format_nchw) {
2339           transposeloop(l, i, j, k, SL, SI, SJ, SK);
2340         } else {
2341           transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
2342         }
2343       } else {
2344         if (is_format_nchw) {
2345           transposeloop(l, i, k, j, SL, SI, SK, SJ);
2346         } else {
2347           transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
2348         }
2349       }
2350       break;
2351     case 1:
2352       if (indexJ == 0) {
2353         if (is_format_nchw) {
2354           transposeloop(l, j, i, k, SL, SJ, SI, SK);
2355         } else {
2356           transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
2357         }
2358       } else {
2359         if (is_format_nchw) {
2360           transposeloop(l, j, k, i, SL, SJ, SK, SI);
2361         } else {
2362           transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
2363         }
2364       }
2365       break;
2366     case 2:
2367       if (indexJ == 0) {
2368         if (is_format_nchw) {
2369           transposeloop(l, k, i, j, SL, SK, SI, SJ);
2370         } else {
2371           transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
2372         }
2373       } else {
2374         if (is_format_nchw) {
2375           transposeloop(l, k, j, i, SL, SK, SJ, SI);
2376         } else {
2377           transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
2378         }
2379       }
2380       break;
2381     }
2382 #else
2383     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2384 #endif
2385   }
2386
2387   return out;
2388 }
2389
2390 Tensor Tensor::transpose(const std::string &direction) const {
2391   Tensor result(dim);
2392   transpose(direction, result);
2393   return result;
2394 }
2395
2396 Tensor Tensor::dropout_mask(float dropout) const {
2397   Tensor result(dim);
2398   result.dropout_mask(dropout);
2399   return result;
2400 }
2401
2402 void Tensor::dropout_mask(float dropout) {
2403   setRandUniform(0.0, 1.0);
2404   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2405     float scale = 1.0 / (1 - dropout);
2406     float *data_ = getData();
2407     for (unsigned int i = 0; i < size(); ++i) {
2408       if (data_[i] >= dropout)
2409         data_[i] = scale;
2410       else
2411         data_[i] = 0.0;
2412     }
2413   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2414 #ifdef ENABLE_FP16
2415     __fp16 scale = 1.0 / (1 - dropout);
2416     __fp16 *data_ = getData<__fp16>();
2417     for (unsigned int i = 0; i < size(); ++i) {
2418       if (data_[i] >= dropout)
2419         data_[i] = scale;
2420       else
2421         data_[i] = 0.0;
2422     }
2423 #else
2424     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2425 #endif
2426   }
2427 }
2428
2429 void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
2430   float fill_mask_val = 0.0;
2431   float en_mask_val = 1.0 - fill_mask_val;
2432
2433   if (reverse) {
2434     fill_mask_val = 1.0;
2435     en_mask_val = 1.0 - fill_mask_val;
2436   }
2437
2438   setValue(fill_mask_val);
2439   if (mask_len.batch() != batch())
2440     throw std::invalid_argument("Number of filter masks mismatched");
2441   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2442     for (unsigned int b = 0; b < batch(); b++) {
2443       float *addr = getAddress(b, 0, 0, 0);
2444       const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2445       std::fill(addr, addr + (*mask_len_val), en_mask_val);
2446     }
2447   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2448 #ifdef ENABLE_FP16
2449     for (unsigned int b = 0; b < batch(); b++) {
2450       __fp16 *addr = getAddress<__fp16>(b, 0, 0, 0);
2451       const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
2452       std::fill(addr, addr + (*mask_len_val), (__fp16)en_mask_val);
2453     }
2454 #else
2455     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2456 #endif
2457   }
2458 }
2459
2460 Tensor Tensor::zoneout_mask(float zoneout) {
2461   Tensor ret(getDim());
2462   zoneout_mask(ret, zoneout);
2463   return ret;
2464 }
2465
2466 void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
2467   if (dim != opposite.dim) {
2468     throw std::invalid_argument(
2469       "[Tensor::zoneout_mask] opposite dimension does not match");
2470   }
2471
2472   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2473     opposite.setRandBernoulli(zoneout);
2474
2475     float *data = getData();
2476     float *opposite_data = opposite.getData();
2477
2478     for (unsigned int i = 0; i < size(); ++i) {
2479       if (opposite_data[i] > epsilon) {
2480         data[i] = 0.0f;
2481       } else {
2482         data[i] = 1.0f;
2483       }
2484     }
2485   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2486 #ifdef ENABLE_FP16
2487     __fp16 zoneout_fp16 = (__fp16)zoneout;
2488     opposite.setRandBernoulli(zoneout_fp16);
2489
2490     __fp16 *data = getData<__fp16>();
2491     __fp16 *opposite_data = opposite.getData<__fp16>();
2492
2493     for (unsigned int i = 0; i < size(); ++i) {
2494       if (opposite_data[i] > epsilon) {
2495         data[i] = (__fp16)0.0;
2496       } else {
2497         data[i] = (__fp16)1.0;
2498       }
2499     }
2500 #else
2501     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2502 #endif
2503   }
2504 }
2505
2506 // int Tensor::apply_i(std::function<float(float)> f) {
2507 //   Tensor result = *this;
2508 //   apply(f, result);
2509
2510 //   return ML_ERROR_NONE;
2511 // }
2512
2513 // Tensor Tensor::apply(std::function<float(float)> f) const {
2514 //   Tensor result;
2515 //   return apply(f, result);
2516 // }
2517
2518 // Tensor &Tensor::apply(std::function<float(float)> f, Tensor &output) const {
2519 //   CREATE_IF_EMPTY_DIMS(output, dim);
2520
2521 //   if (dim != output.dim) {
2522 //     /// @todo add unittest
2523 //     throw std::invalid_argument(
2524 //       "[Tensor::apply] output dimension does not match");
2525 //   }
2526
2527 //   if (contiguous && output.contiguous) {
2528 //     const float *data = getData();
2529 //     float *rdata = output.getData();
2530 //     std::transform(data, data + size(), rdata, f);
2531 //   } else if (strides[3] == 1 && output.strides[3] == 1) {
2532 //     /** @todo optimize this with combining these loops where stride is 1 */
2533 //     for (unsigned int b = 0; b < batch(); ++b) {
2534 //       for (unsigned int c = 0; c < channel(); ++c) {
2535 //         for (unsigned int h = 0; h < height(); ++h) {
2536 //           float *out_data = output.getAddress(b, c, h, 0);
2537 //           const float *in_data = getAddress(b, c, h, 0);
2538 //           std::transform(in_data, in_data + width(), out_data, f);
2539 //         }
2540 //       }
2541 //     }
2542 //   } else {
2543 //     for (unsigned int b = 0; b < batch(); ++b) {
2544 //       for (unsigned int c = 0; c < channel(); ++c) {
2545 //         for (unsigned int h = 0; h < height(); ++h) {
2546 //           for (unsigned int w = 0; w < width(); ++w) {
2547 //             output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
2548 //           }
2549 //         }
2550 //       }
2551 //     }
2552 //   }
2553
2554 //   return output;
2555 // }
2556
2557 Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
2558
2559 Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
2560                       Tensor &output) const {
2561   return f(*this, output);
2562 }
2563
2564 void Tensor::print(std::ostream &out) const {
2565   printInstance(out, this);
2566   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2567     const float *data = getData<float>();
2568     unsigned int len = size();
2569     out << "data addr: " << data << '\n';
2570     out << dim;
2571
2572     if (len > 100) {
2573       out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2574           << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2575           << ']' << std::endl;
2576       return;
2577     }
2578
2579     std::ios init(NULL);
2580     init.copyfmt(out);
2581     if (getFormat() == Tformat::NCHW) {
2582       for (unsigned int k = 0; k < batch(); k++) {
2583         for (unsigned int l = 0; l < channel(); l++) {
2584           for (unsigned int i = 0; i < height(); i++) {
2585             for (unsigned int j = 0; j < width(); j++) {
2586               out << std::setw(10) << std::setprecision(10)
2587                   << this->getValue<float>(k, l, i, j) << " ";
2588             }
2589             out << std::endl;
2590           }
2591           out << std::endl;
2592         }
2593         out << "-------" << std::endl;
2594       }
2595     } else {
2596       for (unsigned int k = 0; k < batch(); k++) {
2597         for (unsigned int i = 0; i < height(); i++) {
2598           for (unsigned int j = 0; j < width(); j++) {
2599             for (unsigned int l = 0; l < channel(); l++) {
2600               out << std::setw(10) << std::setprecision(10)
2601                   << this->getValue<float>(k, l, i, j) << " ";
2602             }
2603             out << std::endl;
2604           }
2605           out << std::endl;
2606         }
2607         out << "-------" << std::endl;
2608       }
2609       out.copyfmt(init);
2610     }
2611   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2612 #ifdef ENABLE_FP16
2613     const __fp16 *data = getData<__fp16>();
2614     unsigned int len = size();
2615     out << "data addr: " << data << '\n';
2616     out << dim;
2617
2618     if (len > 100) {
2619       out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
2620           << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1]
2621           << ']' << std::endl;
2622       return;
2623     }
2624
2625     std::ios init(NULL);
2626     init.copyfmt(out);
2627     if (getFormat() == Tformat::NCHW) {
2628       for (unsigned int k = 0; k < batch(); k++) {
2629         for (unsigned int l = 0; l < channel(); l++) {
2630           for (unsigned int i = 0; i < height(); i++) {
2631             for (unsigned int j = 0; j < width(); j++) {
2632               out << std::setw(10) << std::setprecision(10)
2633                   << this->getValue<__fp16>(k, l, i, j) << " ";
2634             }
2635             out << std::endl;
2636           }
2637           out << std::endl;
2638         }
2639         out << "-------" << std::endl;
2640       }
2641     } else {
2642       for (unsigned int k = 0; k < batch(); k++) {
2643         for (unsigned int i = 0; i < height(); i++) {
2644           for (unsigned int j = 0; j < width(); j++) {
2645             for (unsigned int l = 0; l < channel(); l++) {
2646               out << std::setw(10) << std::setprecision(10)
2647                   << this->getValue<__fp16>(k, l, i, j) << " ";
2648             }
2649             out << std::endl;
2650           }
2651           out << std::endl;
2652         }
2653         out << "-------" << std::endl;
2654       }
2655       out.copyfmt(init);
2656     }
2657 #else
2658     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2659 #endif
2660   }
2661 }
2662
2663 void Tensor::print_(std::ostream &out, uint opt) const {
2664   printInstance(out, this);
2665   const float *data = getData();
2666
2667   unsigned int len = size();
2668
2669   std::ios init(NULL);
2670   init.copyfmt(out);
2671   if (opt == 0) {
2672     if (getFormat() == Tformat::NCHW) {
2673       out << "{";
2674       for (unsigned int k = 0; k < batch(); k++) {
2675         out << "{";
2676         for (unsigned int i = 0; i < channel(); i++) {
2677           out << "{";
2678           for (unsigned int j = 0; j < height(); j++) {
2679             out << "{";
2680             for (unsigned int l = 0; l < width(); l++) {
2681               if (l < channel() - 1)
2682                 out << std::setw(10) << std::setprecision(10)
2683                     << this->getValue<float>(k, l, i, j) << ", ";
2684               else
2685                 out << std::setw(10) << std::setprecision(10)
2686                     << this->getValue<float>(k, l, i, j);
2687             }
2688             if (j < height() - 1)
2689               out << "},";
2690             else
2691               out << "}";
2692             out << std::endl;
2693           }
2694           if (i < channel() - 1)
2695             out << "},";
2696           else
2697             out << "}";
2698           out << std::endl;
2699         }
2700         if (k < batch() - 1)
2701           out << "},";
2702         else
2703           out << "}";
2704         out << std::endl;
2705       }
2706       out << "}";
2707     } else {
2708       out << "{";
2709       for (unsigned int k = 0; k < batch(); k++) {
2710         out << "{";
2711         for (unsigned int i = 0; i < height(); i++) {
2712           out << "{";
2713           for (unsigned int j = 0; j < width(); j++) {
2714             out << "{";
2715             for (unsigned int l = 0; l < channel(); l++) {
2716               if (l < channel() - 1)
2717                 out << std::setw(10) << std::setprecision(10)
2718                     << this->getValue<float>(k, l, i, j) << ", ";
2719               else
2720                 out << std::setw(10) << std::setprecision(10)
2721                     << this->getValue<float>(k, l, i, j);
2722             }
2723             if (j < width() - 1)
2724               out << "},";
2725             else
2726               out << "}";
2727             out << std::endl;
2728           }
2729           if (i < height() - 1)
2730             out << "},";
2731           else
2732             out << "}";
2733           out << std::endl;
2734         }
2735         if (k < batch() - 1)
2736           out << "},";
2737         else
2738           out << "}";
2739         out << std::endl;
2740       }
2741       out << "}";
2742     }
2743   } else {
2744     for (uint i = 0; i < len; ++i) {
2745       out << getData<float>()[i] << ", ";
2746     }
2747   }
2748 }
2749
2750 std::ostream &operator<<(std::ostream &out, Tensor const &m) {
2751   m.print(out);
2752   return out;
2753 }
2754
2755 void Tensor::copy(const void *buf) {
2756   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2757     << getName() << "Tensor is not contiguous, cannot copy.";
2758
2759   if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2760 #ifdef ENABLE_FP16
2761     if (buf == getData<__fp16>()) {
2762       return;
2763     }
2764 #else
2765     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2766 #endif
2767   } else if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2768     if (buf == getData()) {
2769       return;
2770     }
2771   }
2772   // std::string type_ =
2773   //   (getDataType() == ml::train::TensorDim::DataType::FP16) ? "FP16" : "NO";
2774   // std::cout << type_ << std::endl;
2775
2776   if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2777 #ifdef ENABLE_FP16
2778     scopy(size(), (__fp16 *)buf, 1, getData<__fp16>(), 1);
2779 #else
2780     throw std::invalid_argument("Error: enable-fp16 is not enabled");
2781 #endif
2782   } else if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2783     scopy(size(), (float *)buf, 1, getData<float>(), 1);
2784   }
2785 }
2786
2787 void Tensor::copy_with_stride(const Tensor &from) {
2788
2789   if (dim == from.getDim()) {
2790     if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2791       for (unsigned int b = 0; b < batch(); ++b) {
2792         for (unsigned int c = 0; c < channel(); ++c) {
2793           for (unsigned int h = 0; h < height(); ++h) {
2794             for (unsigned int w = 0; w < width(); ++w) {
2795               setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
2796             }
2797           }
2798         }
2799       }
2800     } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
2801 #ifdef ENABLE_FP16
2802       for (unsigned int b = 0; b < batch(); ++b) {
2803         for (unsigned int c = 0; c < channel(); ++c) {
2804           for (unsigned int h = 0; h < height(); ++h) {
2805             for (unsigned int w = 0; w < width(); ++w) {
2806               setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2807             }
2808           }
2809         }
2810       }
2811 #else
2812       throw std::invalid_argument("Error: enable-fp16 is not enabled");
2813 #endif
2814     }
2815   } else {
2816     Tensor t = Tensor(from.getDim(), true);
2817     if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
2818       for (unsigned int b = 0; b < t.batch(); ++b) {
2819         for (unsigned int c = 0; c < t.channel(); ++c) {
2820           for (unsigned int h = 0; h < t.height(); ++h) {
2821             for (unsigned int w = 0; w < t.width(); ++w) {
2822               t.setValue(b, c, h, w, from.getValue<float>(b, c, h, w));
2823             }
2824           }
2825         }
2826       }
2827     } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
2828 #ifdef ENABLE_FP16
2829       for (unsigned int b = 0; b < batch(); ++b) {
2830         for (unsigned int c = 0; c < channel(); ++c) {
2831           for (unsigned int h = 0; h < height(); ++h) {
2832             for (unsigned int w = 0; w < width(); ++w) {
2833               setValue(b, c, h, w, from.getValue<__fp16>(b, c, h, w));
2834             }
2835           }
2836         }
2837       }
2838 #else
2839       throw std::invalid_argument("Error: enable-fp16 is not enabled");
2840 #endif
2841     }
2842     swap(t, *this);
2843   }
2844 }
2845
2846 void Tensor::copy(const Tensor &from) {
2847   // todo: enable copy to non-contiguous tensor
2848   if (!contiguous) {
2849     throw std::runtime_error("Cannot copy non-contiguous tensor");
2850   }
2851
2852   if (from.size() != 0 && size() == from.size() &&
2853       getDataType() == from.getDataType()) {
2854     reshape(from.getDim());
2855     if (getDataType() == ml::train::TensorDim::DataType::FP32) {
2856       copy(from.getData());
2857     } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
2858 #ifdef ENABLE_FP16
2859       copy(from.getData<__fp16>());
2860 #else
2861       throw std::invalid_argument("Error: enable-fp16 is not enabled");
2862 #endif
2863     }
2864
2865   } else {
2866     Tensor t = Tensor(from.getDim(), from.getData());
2867     swap(t, *this);
2868   }
2869 }
2870
2871 void Tensor::copyData(const Tensor &from) {
2872   // todo: enable copy to non-contiguous tensor
2873   if (!contiguous) {
2874     throw std::runtime_error("Cannot copy non-contiguous tensor");
2875   }
2876
2877   if (size() != from.size())
2878     throw std::invalid_argument("Size of tensor to copy must match");
2879
2880   if (getDataType() != from.getDataType())
2881     throw std::invalid_argument("Data type of tensor to copy must match");
2882
2883   copy(from.getData());
2884 }
2885
2886 Tensor Tensor::clone() const {
2887   Tensor t;
2888   t.copy(*this);
2889   t.name = name;
2890   return t;
2891 }
2892
2893 void Tensor::reshape(const TensorDim &d) {
2894
2895   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2896     << getName() << " is not contiguous, cannot reshape.";
2897
2898   NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument)
2899     << "[Tensor]: reshape cannot change the buffer size, trying reshaping "
2900        "\nfrom "
2901     << getDim() << " to " << d;
2902
2903   dim = d;
2904   strides = d.computeStrides();
2905 }
2906
2907 void Tensor::fill(const Tensor &from, bool alloc) {
2908   if (alloc && this->empty()) {
2909     this->copy(from);
2910     return;
2911   }
2912
2913   if (!from.contiguous || !contiguous) {
2914     /// @todo enable this if needed
2915     throw nntrainer::exception::not_supported(
2916       "[Tensor::fill] non-contiguous tensors are not supported");
2917   }
2918
2919   if (dim != from.getDim()) {
2920     throw std::invalid_argument("[Tensor::fill] dimension must be the same");
2921   }
2922
2923   if (strides != from.getStrides()) {
2924     /// @todo length does not represent buffer size, there should be way to
2925     /// get the buffer size
2926     throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
2927   }
2928
2929   this->copy(from.getData());
2930 }
2931
2932 void Tensor::save(std::ostream &file) {
2933   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2934     << getName() << " is not contiguous, cannot save.";
2935
2936   std::streamsize sz = static_cast<std::streamsize>(bytes());
2937   NNTR_THROW_IF(sz < 0, std::invalid_argument)
2938     << "save size: " << bytes()
2939     << " is too big. It cannot be represented by std::streamsize";
2940
2941   checkedWrite(file, (char *)getData(), sz, "[Tensor::save] operation failed");
2942   putData();
2943 }
2944
2945 void Tensor::read(std::ifstream &file) {
2946   NNTR_THROW_IF(!contiguous, std::invalid_argument)
2947     << getName() << " is not contiguous, cannot read.";
2948
2949   std::streamsize sz = static_cast<std::streamsize>(bytes());
2950
2951   NNTR_THROW_IF(sz < 0, std::invalid_argument)
2952     << "read size: " << bytes()
2953     << " is too big. It cannot be represented by std::streamsize";
2954
2955   checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
2956   putData();
2957 }
2958
2959 /**
2960  * @brief Calculate average value according to the axis.
2961  */
2962 Tensor Tensor::average(unsigned int axis) const {
2963   Tensor t("", this->getFormat(), this->getDataType());
2964   return average(axis, t);
2965 }
2966
2967 /**
2968  * @brief Calculate average value according to the axis.
2969  */
2970 Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
2971   if (axis >= TensorDim::MAXDIM)
2972     throw std::out_of_range(
2973       "negative axis or axis more then MAXDIM is invalid");
2974
2975   unsigned int axis_size = dim.getDim()[axis];
2976   if (axis_size == 1)
2977     output.copy(*this);
2978   else
2979     this->sum(axis, output, 1.0 / ((float)axis_size));
2980
2981   return output;
2982 }
2983
2984 Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
2985   Tensor t("", this->getFormat(), this->getDataType());
2986   return average(axes, t);
2987 }
2988
2989 Tensor &Tensor::average(const std::vector<unsigned int> &axes,
2990                         Tensor &output) const {
2991   if (axes.empty())
2992     return this->average(output);
2993
2994   TensorDim ret_shape(getTensorType());
2995
2996   for (const auto &idx : axes) {
2997     if (idx >= TensorDim::MAXDIM) {
2998       throw std::out_of_range("axis more then MAXDIM is invalid");
2999     }
3000     ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
3001   }
3002
3003   return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
3004 }
3005
3006 /**
3007  * @brief Calculate average value according to the axis.
3008  */
3009 Tensor Tensor::average() const {
3010   Tensor result = *this;
3011   unsigned int axis = 0;
3012   if (this->getFormat() == Tformat::NHWC) {
3013     result.reshape({1, dim.getDataLen(), 1, 1, this->getTensorType()});
3014     axis = 1;
3015   } else {
3016     result.reshape({1, 1, 1, dim.getDataLen(), this->getTensorType()});
3017     axis = 3;
3018   }
3019   return result.average(axis);
3020 }
3021
3022 /**
3023  * @brief Calculate average value according to the axis.
3024  */
3025 Tensor &Tensor::average(Tensor &output) const {
3026   Tensor result = *this;
3027   result.reshape({1, 1, 1, dim.getDataLen()});
3028   return result.average(3, output);
3029 }
3030
3031 void Tensor::setValue(float val) {
3032   NNTR_THROW_IF(!contiguous, std::invalid_argument)
3033     << getName() << " is not contiguous, cannot set value.";
3034   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3035     float *data = getData<float>();
3036     std::fill(data, data + size(), val);
3037   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3038 #ifdef ENABLE_FP16
3039     __fp16 *data = getData<__fp16>();
3040     std::fill(data, data + size(), (__fp16)val);
3041 #else
3042     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3043 #endif
3044   }
3045 }
3046
3047 void Tensor::setZero() {
3048   if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
3049     if (contiguous)
3050       sscal(size(), 0, getData<float>(), 1);
3051     else
3052       apply_i([](float val) -> float { return 0; });
3053   } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
3054 #ifdef ENABLE_FP16
3055     if (contiguous)
3056       sscal(size(), 0, getData<__fp16>(), 1);
3057     else
3058       apply_i([](__fp16 val) -> __fp16 { return 0; });
3059 #else
3060     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3061 #endif
3062   }
3063 }
3064
3065 std::vector<unsigned int> Tensor::argmax() const {
3066   NNTR_THROW_IF(!contiguous, std::invalid_argument)
3067     << getName() << " is not contiguous, cannot get argmax.";
3068   std::vector<unsigned int> result;
3069
3070   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3071     const float *data = getData();
3072     size_t batch_size = batch();
3073     size_t feature_len = dim.getFeatureLen();
3074
3075     result.resize(batch_size);
3076
3077     for (unsigned int b = 0; b < batch_size; b++) {
3078       auto max_iter =
3079         std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
3080       result[b] = std::distance(data, max_iter) - (b * feature_len);
3081     }
3082   }
3083   if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3084 #ifdef ENABLE_FP16
3085     const __fp16 *data = getData<__fp16>();
3086     size_t batch_size = batch();
3087     size_t feature_len = dim.getFeatureLen();
3088
3089     result.resize(batch_size);
3090
3091     for (unsigned int b = 0; b < batch_size; b++) {
3092       auto max_iter =
3093         std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
3094       result[b] = std::distance(data, max_iter) - (b * feature_len);
3095     }
3096 #else
3097     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3098 #endif
3099   }
3100
3101   return result;
3102 }
3103
3104 float Tensor::l2norm() const {
3105   NNTR_THROW_IF(!contiguous, std::invalid_argument)
3106     << getName() << " is not contiguous, cannot get l2norm.";
3107   float ret;
3108   unsigned int len = size();
3109   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3110     const float *data = getData<float>();
3111     ret = snrm2(len, data, 1);
3112   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3113 #ifdef ENABLE_FP16
3114     const __fp16 *data = getData<__fp16>();
3115     ret = snrm2(len, data, 1);
3116 #else
3117     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3118 #endif
3119   }
3120   return ret;
3121 }
3122
3123 float Tensor::max_abs() const {
3124   NNTR_THROW_IF(!contiguous, std::invalid_argument)
3125     << getName() << " is not contiguous, cannot get max_abs.";
3126
3127   unsigned int len = size();
3128   float ret;
3129   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3130     const float *data = getData<float>();
3131
3132     unsigned int idx = isamax(len, data, 1);
3133     ret = *(data + idx);
3134
3135   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3136 #ifdef ENABLE_FP16
3137     const __fp16 *data = getData<__fp16>();
3138
3139     unsigned int idx = isamax(len, data, 1);
3140     ret = *(data + idx);
3141 #else
3142     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3143 #endif
3144   }
3145   return ret;
3146 }
3147
3148 Tensor &Tensor::normalization(Tensor &output) const {
3149   if (output.empty())
3150     output = Tensor(dim);
3151
3152   output.copy(*this);
3153   output.normalization_i();
3154
3155   return output;
3156 }
3157
3158 void Tensor::normalization_i() {
3159   NNTR_THROW_IF(!contiguous, std::invalid_argument)
3160     << getName() << " is not contiguous, cannot do normalization.";
3161
3162   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3163     const float *data = getData();
3164
3165     auto bounds = std::minmax_element(data, data + size());
3166     const float min = *bounds.first;
3167     const float max = *bounds.second;
3168
3169     if (max == min) {
3170       Tensor tmp = *this;
3171       this->subtract_i(tmp);
3172     } else {
3173       this->subtract_i(min);
3174       this->divide_i(max - min);
3175     }
3176   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3177 #ifdef ENABLE_FP16
3178     const __fp16 *data = getData<__fp16>();
3179
3180     auto bounds = std::minmax_element(data, data + size());
3181     const __fp16 min = *bounds.first;
3182     const __fp16 max = *bounds.second;
3183
3184     if (max == min) {
3185       Tensor tmp = *this;
3186       this->subtract_i(tmp);
3187     } else {
3188       this->subtract_i(min);
3189       this->divide_i(max - min);
3190     }
3191 #else
3192     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3193 #endif
3194   }
3195 }
3196
3197 LazyTensor Tensor::chain() const { return LazyTensor(*this); }
3198
3199 Tensor &Tensor::standardization(Tensor &output) const {
3200   if (output.empty())
3201     output = Tensor(dim);
3202
3203   output.copy(*this);
3204   output.standardization_i();
3205
3206   return output;
3207 }
3208
3209 void Tensor::standardization_i() {
3210   Tensor mean_by_batch = this->sum_by_batch();
3211   mean_by_batch.divide_i(dim.getFeatureLen());
3212
3213   this->subtract_i(mean_by_batch);
3214   if (getDataType() == ml::train::TensorDim::DataType::FP32) {
3215     Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3216     std_dev_by_batch.setZero();
3217     float *std_dev = std_dev_by_batch.getData();
3218
3219     for (unsigned int k = 0; k < dim.batch(); ++k) {
3220       Tensor sub_this = this->getBatchSlice(k, 1);
3221       std_dev[k] = sub_this.l2norm();
3222     }
3223
3224     std_dev_by_batch.divide_i(dim.getFeatureLen());
3225     this->divide_i(std_dev_by_batch);
3226   } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
3227 #ifdef ENABLE_FP16
3228     Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
3229     std_dev_by_batch.setZero();
3230     __fp16 *std_dev = std_dev_by_batch.getData<__fp16>();
3231
3232     for (unsigned int k = 0; k < dim.batch(); ++k) {
3233       Tensor sub_this = this->getBatchSlice(k, 1);
3234       std_dev[k] = sub_this.l2norm();
3235     }
3236
3237     std_dev_by_batch.divide_i(dim.getFeatureLen());
3238     this->divide_i(std_dev_by_batch);
3239 #else
3240     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3241 #endif
3242   }
3243 }
3244
3245 Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
3246   if (m.size() > this->size())
3247     throw exception::not_supported("broadcasting *this is not supported");
3248
3249   const TensorDim m_dim = m.getDim();
3250
3251   BroadcastInfo e;
3252   e.tensor_type = getTensorType();
3253
3254   uint continuity[4] = {0, 1, 2, 3};
3255   if (getFormat() == Tformat::NHWC) {
3256     continuity[1] = 2;
3257     continuity[2] = 3;
3258     continuity[3] = 1;
3259   }
3260
3261   /// checking if given Tensor's can be broadcasted
3262   for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) {
3263     if (dim.getTensorDim(continuity[i]) == m_dim.getTensorDim(continuity[i])) {
3264       e.strides[i] = m.strides[i];
3265       continue;
3266     }
3267
3268     /// If given dimension is 1, it could be reused, the stride remaining 0
3269     /// Need to check if dim[i] == 1 && m_dim[i] == 1 first though
3270     /// If so, strides should not change
3271     if (m_dim.getTensorDim(continuity[i]) == 1) {
3272       continue;
3273     }
3274
3275     std::stringstream ss;
3276     ss << "[computeBroadcastInfo] broadcasting only allowed for "
3277           "dimension value of 1 \n"
3278        << "this: " << dim << "target: " << m_dim;
3279     throw std::invalid_argument(ss.str().c_str());
3280   }
3281
3282   /// calculate inner loop size
3283   e.buffer_size = 1;
3284   e.buffer_axis = -1;
3285   e.strides[3] = m.strides[3];
3286
3287   /// initiate buffer info with matching dimension strategy
3288   for (int axis = 3; axis >= 0; --axis) {
3289     if (dim.getTensorDim(continuity[axis]) !=
3290         m_dim.getTensorDim(continuity[axis])) {
3291       e.buffer_axis = axis;
3292       break;
3293     }
3294
3295     e.buffer_size *= dim.getTensorDim(continuity[axis]);
3296   }
3297
3298   /// check strategy that uses consecutive ones
3299   if (m_dim.getTensorDim(continuity[3]) == 1) {
3300     unsigned int inner_loop_size = 1;
3301     int axis;
3302     for (axis = 3; axis >= 0; --axis) {
3303       if (m_dim.getTensorDim(continuity[axis]) != 1) {
3304         break;
3305       }
3306
3307       inner_loop_size *= dim.getTensorDim(continuity[axis]);
3308     }
3309
3310     /// if consecutive-one strategy has bigger chunk size, replace the
3311     /// information
3312     if (inner_loop_size > e.buffer_size) {
3313       e.buffer_axis = axis;
3314       e.buffer_size = inner_loop_size;
3315       e.strides[3] = 0;
3316     }
3317   }
3318
3319   return e;
3320 }
3321
3322 Tensor Tensor::rotate_180(Tensor in) {
3323   Tensor output(in.getDim());
3324   if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
3325     output.setZero();
3326     for (unsigned int i = 0; i < in.batch(); ++i) {
3327       for (unsigned int j = 0; j < in.channel(); ++j) {
3328         for (unsigned int k = 0; k < in.height(); ++k) {
3329           for (unsigned int l = 0; l < in.width(); ++l) {
3330             output.setValue(i, j, k, l,
3331                             in.getValue<float>(i, j, (in.height() - k - 1),
3332                                                (in.width() - l - 1)));
3333           }
3334         }
3335       }
3336     }
3337   } else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
3338 #ifdef ENABLE_FP16
3339     output.setZero();
3340     for (unsigned int i = 0; i < in.batch(); ++i) {
3341       for (unsigned int j = 0; j < in.channel(); ++j) {
3342         for (unsigned int k = 0; k < in.height(); ++k) {
3343           for (unsigned int l = 0; l < in.width(); ++l) {
3344             output.setValue(i, j, k, l,
3345                             in.getValue<__fp16>(i, j, (in.height() - k - 1),
3346                                                 (in.width() - l - 1)));
3347           }
3348         }
3349       }
3350     }
3351 #else
3352     throw std::invalid_argument("Error: enable-fp16 is not enabled");
3353 #endif
3354   }
3355   return output;
3356 }
3357
3358 } /* namespace nntrainer */