nntrainer/tensor/tensor.cpp

   1 /**
   2  * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *   http://www.apache.org/licenses/LICENSE-2.0
   8  * Unless required by applicable law or agreed to in writing, software
   9  * distributed under the License is distributed on an "AS IS" BASIS,
  10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11  * See the License for the specific language governing permissions and
  12  * limitations under the License.
  13  *
  14  *
  15  * @file        tensor.cpp
  16  * @date        04 December 2019
  17  * @brief       This is Tensor class for calculation
  18  * @see         https://github.com/nnstreamer/nntrainer
  19  * @author      Jijoong Moon <jijoong.moon@samsung.com>
  20  * @bug         No known bugs except for NYI items
  21  *
  22  */
  23
  24 #include <algorithm>
  25 #include <assert.h>
  26 #include <cmath>
  27 #include <cstring>
  28 #include <fstream>
  29 #include <iomanip>
  30 #include <iostream>
  31 #include <iterator>
  32 #include <numeric>
  33 #include <random>
  34 #include <regex>
  35 #include <sstream>
  36 #include <stdexcept>
  37 #include <stdio.h>
  38
  39 #include <blas_interface.h>
  40 #include <lazy_tensor.h>
  41 #include <nntrainer_error.h>
  42 #include <nntrainer_log.h>
  43 #include <tensor.h>
  44 #include <util_func.h>
  45
  46 #define transposeloop(cl, ci, cj, ck, sl, si, sj, sk)                 \
  47   do {                                                                \
  48     unsigned int i, j, k, l;                                          \
  49     int inidx = 0, outidx = 0;                                        \
  50     for (cl = 0; cl < sl; cl++)                                       \
  51       for (ci = 0; ci < si; ci++)                                     \
  52         for (cj = 0; cj < sj; cj++)                                   \
  53           for (ck = 0; ck < sk; ck++) {                               \
  54             outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \
  55             inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k;      \
  56             outptr[outidx] = inptr[inidx];                            \
  57           }                                                           \
  58   } while (0);
  59
  60 #define CREATE_IF_EMPTY_DIMS(tensor, ...) \
  61   do {                                    \
  62     if (tensor.empty())                   \
  63       tensor = Tensor(__VA_ARGS__);       \
  64   } while (0);
  65 namespace nntrainer {
  66
  67 /**
  68  * @struct External Loop Info for broadcasted info
  69  * @brief External Loop Info for broadcasted iteration. Please refer to
  70  * DISABLED_private_external_loop_n in unittest_nntrainer_tensor.
  71  * @note This should better be implemented in iterator fashion before used
  72  * extensively.
  73  */
  74 struct Tensor::BroadcastInfo {
  75
  76   /**
  77    * @brief Construct a new External Loop Info object
  78    *
  79    */
  80   BroadcastInfo() : buffer_size(0), buffer_axis(-1), strides{0, 0, 0, 0} {}
  81
  82   unsigned int buffer_size; /**< virtual size of the buffer */
  83   int buffer_axis;          /**< the smallest axis that should be looped.
  84                                  -1 means no loop needed*/
  85   std::array<unsigned int, TensorDim::MAXDIM>
  86     strides; /**< modified strides for the loop */
  87 };
  88
  89 static auto rng = [] {
  90   std::mt19937 rng;
  91   rng.seed(getSeed());
  92   return rng;
  93 }();
  94
  95 Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init,
  96                std::string name_) :
  97   Tensor(name_) {
  98   if (d.getDataLen() != 0) {
  99     dim = d;
 100     strides = d.computeStrides();
 101     initializer = init;
 102
 103     if (alloc_now)
 104       allocate();
 105   }
 106 }
 107
 108 Tensor::Tensor(const TensorDim &d, const float *buf) : Tensor(d, true) {
 109   if (d.getDataLen() != 0) {
 110     if (buf != nullptr)
 111       copy(buf);
 112   }
 113 }
 114
 115 /**
 116  * @class SrcSharedTensor
 117  * @brief Source of the shared tensor
 118  */
 119 class SrcSharedTensor {
 120 public:
 121   /**
 122    * @brief   Constructor for the class
 123    */
 124   SrcSharedTensor() : src(nullptr), off(0) {}
 125
 126   SrcSharedTensor(const Tensor *tensor, unsigned int offset) :
 127     src(tensor),
 128     off(offset) {}
 129
 130   /**
 131    * @brief   Get the allocated src tensor
 132    */
 133   const Tensor *tensor() const {
 134     if (!src)
 135       throw std::runtime_error("Accessing empty src tensor");
 136
 137     return src;
 138   }
 139
 140   /**
 141    * @brief   Get the offset from the source tensor
 142    */
 143   unsigned int offset() const { return off; }
 144
 145 private:
 146   const Tensor *src; /**< Tensor of the source */
 147   unsigned int off;  /**< offset from the source data ptr */
 148 };
 149
 150 void Tensor::allocate() {
 151   if (empty() || data)
 152     /// already allocated
 153     return;
 154
 155   if (src_tensor) {
 156     /// allocate data based on the source tensor
 157     data = src_tensor->tensor()->data;
 158     offset = src_tensor->tensor()->offset + src_tensor->offset();
 159     /** as this memory is shared, do NOT initialize */
 160   } else {
 161     /// allocate new memory for the tensor data
 162     auto mem_data = new MemoryData<float>(new float[dim.getDataLen()]);
 163     data = std::shared_ptr<MemoryData<float>>(mem_data, [](auto *mem_data) {
 164       delete[] mem_data->getAddr();
 165       delete mem_data;
 166     });
 167     offset = 0;
 168     initialize();
 169   }
 170 }
 171
 172 Tensor Tensor::Map(float *buf, unsigned int bytes, const TensorDim &d,
 173                    int offset) {
 174   if (d.getDataLen() == 0 || buf == nullptr) {
 175     throw std::invalid_argument(
 176       "[Tensor::Map] empty tensor dim is not allowed");
 177   }
 178
 179   if (d.getDataLen() * sizeof(float) + offset > bytes) {
 180     throw std::invalid_argument(
 181       "Creating shared tensor of size bigger than tensor memory.");
 182   }
 183
 184   Tensor tmp;
 185   tmp.dim = d;
 186   tmp.strides = d.computeStrides();
 187   /// Tensor does not own the memory
 188   tmp.data = std::shared_ptr<MemoryData<float>>(
 189     new MemoryData<float>(buf), std::default_delete<MemoryData<float>>());
 190   tmp.offset = offset;
 191
 192   return tmp;
 193 }
 194
 195 bool Tensor::operator==(const Tensor &rhs) const {
 196   if (this->dim != rhs.dim)
 197     return false;
 198
 199   size_t len = size();
 200
 201   if (len != rhs.size())
 202     return false;
 203
 204   const float *data = getData();
 205   const float *rdata = rhs.getData();
 206
 207   if (contiguous != rhs.contiguous)
 208     return false;
 209
 210   if (strides != rhs.strides)
 211     return false;
 212
 213   for (size_t i = 0; i < len; ++i) {
 214     /** not checking sign change is intentional to avoid float calculation
 215      * errors around 0 */
 216     if ((std::isnan(data[i]) && !std::isnan(rdata[i])) ||
 217         (!std::isnan(data[i]) && std::isnan(rdata[i])) ||
 218         std::fabs(data[i] - rdata[i]) > epsilon)
 219       return false;
 220   }
 221
 222   return true;
 223 }
 224
 225 template <typename T> void Tensor::setDist(T dist) {
 226   NNTR_THROW_IF(!contiguous, std::invalid_argument)
 227     << getName() << " Tensor is not contiguous, cannot set distribution";
 228
 229   float *data = getData();
 230   unsigned int len = size();
 231   for (unsigned int i = 0; i < len; ++i) {
 232     data[i] = dist(rng);
 233   }
 234 }
 235
 236 void Tensor::setRandNormal(float mean, float std) {
 237   setDist<std::normal_distribution<float>>(
 238     std::normal_distribution<float>(mean, std));
 239 }
 240
 241 void Tensor::setRandUniform(float min, float max) {
 242   setDist<std::uniform_real_distribution<float>>(
 243     std::uniform_real_distribution<float>(min, max));
 244 }
 245
 246 void Tensor::setRandBernoulli(float probability) {
 247   setDist<std::bernoulli_distribution>(
 248     std::bernoulli_distribution(probability));
 249 }
 250
 251 void Tensor::initialize() {
 252   if (empty() || !isAllocated())
 253     return;
 254
 255   unsigned int fan_in, fan_out;
 256
 257   /// @fixme: when unit is equal to one, this does not work, we need to rely on
 258   /// effective dimension then actual numbers here. For now, some heuristics
 259   /// added to infer what would be fan_in/fan_out
 260   if (dim.batch() * dim.channel() * dim.height() == 1) {
 261     fan_out = fan_in = dim.width();
 262   } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
 263     fan_in = dim.height();
 264     fan_out = dim.width();
 265   } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
 266     auto field_size = dim.height() * dim.width();
 267
 268     // this also handles below cases.
 269     // 1. fan_in = fan_out = 1 as well.
 270     // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
 271     fan_in = dim.channel() * field_size;
 272     fan_out = dim.batch() * field_size;
 273   }
 274
 275   switch (initializer) {
 276   case Tensor::Initializer::ZEROS:
 277     setZero();
 278     break;
 279   case Tensor::Initializer::ONES:
 280     setValue(1.0f);
 281     break;
 282   case Tensor::Initializer::LECUN_NORMAL:
 283     setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
 284     break;
 285   case Tensor::Initializer::XAVIER_NORMAL:
 286     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
 287     break;
 288   case Tensor::Initializer::HE_NORMAL:
 289     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
 290     break;
 291   case Tensor::Initializer::LECUN_UNIFORM:
 292     setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
 293     break;
 294   case Tensor::Initializer::XAVIER_UNIFORM:
 295     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
 296                    sqrtFloat(6.0 / (fan_in + fan_out)));
 297     break;
 298   case Tensor::Initializer::HE_UNIFORM:
 299     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
 300                    sqrtFloat(6.0 / (fan_in)));
 301     break;
 302   default:
 303     break;
 304   }
 305
 306   putData();
 307 }
 308
 309 Tensor::Tensor(
 310   std::vector<std::vector<std::vector<std::vector<float>>>> const &d) {
 311
 312   if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
 313     throw std::out_of_range(
 314       "[Tensor] trying to initialize Tensor from empty vector");
 315   }
 316
 317   dim.batch(d.size());
 318   dim.channel(d[0].size());
 319   dim.height(d[0][0].size());
 320   dim.width(d[0][0][0].size());
 321   strides = dim.computeStrides();
 322   auto mem_data = new MemoryData<float>(new float[dim.getDataLen()]);
 323   data = std::shared_ptr<MemoryData<float>>(
 324     mem_data, [](auto *mem_data) { delete[] mem_data->getAddr(); });
 325   offset = 0;
 326   contiguous = true;
 327   initializer = Initializer::NONE;
 328
 329   for (unsigned int i = 0; i < dim.batch(); ++i)
 330     for (unsigned int j = 0; j < dim.channel(); ++j)
 331       for (unsigned int k = 0; k < dim.height(); ++k)
 332         for (unsigned int l = 0; l < dim.width(); ++l)
 333           this->setValue(i, j, k, l, d[i][j][k][l]);
 334 }
 335
 336 int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
 337   try {
 338     this->multiply_strided(m, *this, beta);
 339   } catch (std::exception &err) {
 340     ml_loge("%s %s", typeid(err).name(), err.what());
 341     return ML_ERROR_INVALID_PARAMETER;
 342   }
 343
 344   return ML_ERROR_NONE;
 345 }
 346
 347 Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
 348   Tensor t;
 349   return this->multiply_strided(m, t, beta);
 350 }
 351
 352 Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
 353                                  const float beta) const {
 354   /** TODO: throw than create new dimenions */
 355   CREATE_IF_EMPTY_DIMS(output, dim);
 356
 357   if (size() != m.size() || size() != output.size())
 358     throw std::invalid_argument(
 359       "Strided multiplication does not support broadcasting");
 360
 361   if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 362       beta != 0.0) {
 363     for (unsigned int b = 0; b < batch(); ++b) {
 364       for (unsigned int c = 0; c < channel(); ++c) {
 365         for (unsigned int h = 0; h < height(); ++h) {
 366           for (unsigned int w = 0; w < width(); ++w) {
 367             output.addValue(
 368               b, c, h, w, getValue(b, c, h, w) * m.getValue(b, c, h, w), beta);
 369           }
 370         }
 371       }
 372     }
 373   } else {
 374     /** @todo optimize this with combining these loops where stride is 1 */
 375     for (unsigned int b = 0; b < batch(); ++b) {
 376       for (unsigned int c = 0; c < channel(); ++c) {
 377         for (unsigned int h = 0; h < height(); ++h) {
 378           float *out_data = output.getAddress(b, c, h, 0);
 379           const float *m_data = m.getAddress(b, c, h, 0);
 380           const float *in_data = getAddress(b, c, h, 0);
 381           std::transform(in_data, in_data + width(), m_data, out_data,
 382                          std::multiplies<float>());
 383         }
 384       }
 385     }
 386   }
 387
 388   return output;
 389 }
 390
 391 int Tensor::add_i_strided(Tensor const &m, const float beta) {
 392   try {
 393     this->add_strided(m, *this, beta);
 394   } catch (std::exception &err) {
 395     ml_loge("%s %s", typeid(err).name(), err.what());
 396     return ML_ERROR_INVALID_PARAMETER;
 397   }
 398
 399   return ML_ERROR_NONE;
 400 }
 401
 402 Tensor Tensor::add_strided(Tensor const &m, const float beta) const {
 403   Tensor t;
 404   return this->add_strided(m, t, beta);
 405 }
 406
 407 Tensor &Tensor::add_strided(Tensor const &m, Tensor &output,
 408                             const float beta) const {
 409   /** TODO: throw than create new dimenions */
 410   CREATE_IF_EMPTY_DIMS(output, dim);
 411
 412   if (size() != m.size() || size() != output.size())
 413     throw std::invalid_argument(
 414       "Strided addition does not support broadcasting");
 415
 416   if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 ||
 417       beta != 0.0) {
 418     for (unsigned int b = 0; b < batch(); ++b) {
 419       for (unsigned int c = 0; c < channel(); ++c) {
 420         for (unsigned int h = 0; h < height(); ++h) {
 421           for (unsigned int w = 0; w < width(); ++w) {
 422             output.setValue(
 423               b, c, h, w, getValue(b, c, h, w) + m.getValue(b, c, h, w) * beta);
 424           }
 425         }
 426       }
 427     }
 428   } else {
 429     /** @todo optimize this with combining these loops where stride is 1 */
 430     for (unsigned int b = 0; b < batch(); ++b) {
 431       for (unsigned int c = 0; c < channel(); ++c) {
 432         for (unsigned int h = 0; h < height(); ++h) {
 433           float *out_data = output.getAddress(b, c, h, 0);
 434           const float *m_data = m.getAddress(b, c, h, 0);
 435           const float *in_data = getAddress(b, c, h, 0);
 436           std::transform(in_data, in_data + width(), m_data, out_data,
 437                          std::plus<float>());
 438         }
 439       }
 440     }
 441   }
 442
 443   return output;
 444 }
 445
 446 int Tensor::multiply_i(float const &value) {
 447   NNTR_THROW_IF(!contiguous, std::invalid_argument)
 448     << getName() << " is not contiguous, cannot multiply";
 449
 450   /// @note this is not depending on multiply_i as there is an optimized
 451   /// version for multiply_i
 452   float *data = getData();
 453   unsigned int len = size();
 454
 455   sscal(len, value, data, 1);
 456   return ML_ERROR_NONE;
 457 }
 458
 459 Tensor Tensor::multiply(float const &value) const {
 460   Tensor t;
 461   return multiply(value, t);
 462 }
 463
 464 Tensor &Tensor::multiply(float const &value, Tensor &out) const {
 465   /// @todo add unittest
 466   auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
 467   return apply(f, out);
 468 }
 469
 470 int Tensor::multiply_i(Tensor const &m, const float beta) {
 471   try {
 472     this->multiply(m, *this, beta);
 473   } catch (std::exception &err) {
 474     ml_loge("%s %s", typeid(err).name(), err.what());
 475     return ML_ERROR_INVALID_PARAMETER;
 476   }
 477
 478   return ML_ERROR_NONE;
 479 }
 480
 481 Tensor Tensor::multiply(Tensor const &m, const float beta) const {
 482   Tensor t;
 483   return this->multiply(m, t, beta);
 484 }
 485
 486 Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
 487                          const float beta) const {
 488   /**
 489    * @note this does not work correctly with differently strided inputs.
 490    * Use multiply_strided alternatively
 491    */
 492   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 493                float *out_buf) {
 494     if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1 &&
 495         beta == 0.0) {
 496       std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 497                      std::multiplies<float>());
 498     } else {
 499       for (unsigned int i = 0; i < e.buffer_size; ++i) {
 500         *out_buf = *buf * *m_buf + beta * *out_buf;
 501         buf += strides[3];
 502         m_buf += e.strides[3];
 503         out_buf += output.strides[3];
 504       }
 505     }
 506   };
 507
 508   NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 509                 std::invalid_argument)
 510     << getName() << " is not contiguous, cannot multiply";
 511
 512   apply_broadcast(m, f, output);
 513   return output;
 514 }
 515
 516 int Tensor::divide_i(float const &value) {
 517   if (value == 0.0f) {
 518     return ML_ERROR_INVALID_PARAMETER;
 519   }
 520   this->divide(value, *this);
 521   return ML_ERROR_NONE;
 522 }
 523
 524 Tensor Tensor::divide(float const &value) const {
 525   Tensor t;
 526   return divide(value, t);
 527 }
 528
 529 Tensor &Tensor::divide(float const &value, Tensor &out) const {
 530   auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
 531   /// @todo add unittest
 532   if (value == 0.0f) {
 533     std::stringstream ss;
 534     ss << "[Tensor] divide by value failed, value: " << value;
 535     throw std::invalid_argument(ss.str().c_str());
 536   }
 537   return apply(f, out);
 538 }
 539
 540 int Tensor::divide_i(Tensor const &m) {
 541   try {
 542     this->divide(m, *this);
 543   } catch (std::exception &err) {
 544     ml_loge("%s %s", typeid(err).name(), err.what());
 545     return ML_ERROR_INVALID_PARAMETER;
 546   }
 547
 548   return ML_ERROR_NONE;
 549 }
 550
 551 Tensor Tensor::divide(Tensor const &m) const {
 552   Tensor t;
 553   return this->divide(m, t);
 554 }
 555
 556 Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
 557   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 558                float *out_buf) {
 559     if (e.strides[3] == 1 && output.strides[3] == 1 && strides[3] == 1) {
 560       std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 561                      std::divides<float>());
 562     } else {
 563       for (unsigned int i = 0; i < e.buffer_size; ++i) {
 564         *out_buf = *buf / *m_buf;
 565         buf += strides[3];
 566         m_buf += e.strides[3];
 567         out_buf += output.strides[3];
 568       }
 569     }
 570   };
 571
 572   NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 573                 std::invalid_argument)
 574     << getName() << " is not contiguous, cannot divide";
 575
 576   apply_broadcast(m, f, output);
 577   return output;
 578 }
 579
 580 int Tensor::add_i(float const &value) {
 581   this->add(value, *this);
 582   return ML_ERROR_NONE;
 583 }
 584
 585 Tensor Tensor::add(float const &value) const {
 586   Tensor t;
 587   return add(value, t);
 588 }
 589
 590 Tensor &Tensor::add(float const &value, Tensor &out) const {
 591   /// @todo add unittest
 592   auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
 593   return apply(f, out);
 594 }
 595
 596 int Tensor::add_i(Tensor const &m, float const alpha) {
 597   /// @todo: add axis rather doing add over the last two dimensions always
 598   /// operator i has optimized version
 599   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 600                float *out_buf) {
 601     saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]);
 602   };
 603
 604   /// @todo: enable this after add_strided supports broadcast
 605   // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument)
 606   //   << getName() << " is not contiguous, cannot add";
 607
 608   try {
 609     apply_broadcast(m, f, *this);
 610   } catch (std::exception &err) {
 611     ml_loge("%s %s", typeid(err).name(), err.what());
 612     return ML_ERROR_INVALID_PARAMETER;
 613   }
 614
 615   return ML_ERROR_NONE;
 616 }
 617
 618 Tensor Tensor::add(Tensor const &m, float const alpha) const {
 619   Tensor t;
 620   return this->add(m, t, alpha);
 621 }
 622
 623 Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
 624   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
 625                float *out_buf) {
 626     if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 0) {
 627       std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
 628                      std::plus<float>());
 629     } else {
 630       for (unsigned int i = 0; i < e.buffer_size; ++i) {
 631         *out_buf = *buf + *m_buf * alpha;
 632         buf += strides[3];
 633         m_buf += e.strides[3];
 634         out_buf += strides[3];
 635       }
 636     }
 637   };
 638
 639   NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous,
 640                 std::invalid_argument)
 641     << getName() << " is not contiguous, cannot add";
 642
 643   apply_broadcast(m, f, output);
 644
 645   return output;
 646 }
 647
 648 int Tensor::subtract_i(float const &value) {
 649   this->subtract(value, *this);
 650   return ML_ERROR_NONE;
 651 }
 652
 653 Tensor Tensor::subtract(float const &value) const {
 654   Tensor t;
 655   return subtract(value, t);
 656 }
 657
 658 Tensor &Tensor::subtract(float const &value, Tensor &out) const {
 659   /// @todo add unittest
 660   auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
 661   return apply(f, out);
 662 }
 663
 664 int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
 665
 666 Tensor Tensor::subtract(Tensor const &m) const { return add(m, -1); }
 667
 668 Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const {
 669   return add(m, out, -1);
 670 }
 671
 672 int Tensor::pow_i(float exponent) {
 673   pow(exponent, *this);
 674   return ML_ERROR_NONE;
 675 }
 676
 677 Tensor Tensor::pow(float exponent) const {
 678   Tensor t;
 679   return pow(exponent, t);
 680 }
 681
 682 Tensor &Tensor::pow(float exponent, Tensor &out) const {
 683   auto f = [exponent](float in) { return powf(in, exponent); };
 684   return apply(f, out);
 685 }
 686
 687 Tensor Tensor::getBatchSlice(unsigned int offset, unsigned int size) const {
 688   TensorDim dim_ = dim;
 689   dim_.batch(size);
 690
 691   return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen());
 692 }
 693
 694 void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
 695                                     unsigned int offset) {
 696   /**
 697    * - If src already has data allocaed, then directly make dest tensor based on
 698    * the src tensor.
 699    * - If src.data does not exist (meaning tensor does not memory allocated),
 700    * and src.src_tensor does not exist (meaning the src tensor does not depened
 701    * on another tensor), then create a SrcSharedTensor around the src.
 702    * - If src.src_tensor exists, then use the src.src_tensor to create the
 703    *  required SrcSharedTensor to avoid recursive dependency.
 704    *
 705    * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored
 706    * if the batch size of src is updated and needs reallocation.
 707    */
 708   dest.data = nullptr;
 709   if (src.data) {
 710     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
 711     dest.allocate();
 712   } else if (!src.src_tensor)
 713     dest.src_tensor = std::make_shared<SrcSharedTensor>(&src, offset);
 714   else
 715     dest.src_tensor = std::make_shared<SrcSharedTensor>(
 716       src.src_tensor->tensor(), offset + src.src_tensor->offset());
 717 }
 718
 719 Tensor Tensor::getSharedDataTensor(const TensorDim dim_, unsigned int offset,
 720                                    bool reset_stride,
 721                                    const std::string &name_) const {
 722   Tensor ret = *this;
 723   ret.dim = dim_;
 724   if (!name_.empty())
 725     ret.name = name_;
 726
 727   if (dim_.getDataLen() + offset > dim.getDataLen())
 728     throw std::invalid_argument(
 729       "Creating shared tensor of size bigger than tensor memory.");
 730
 731   if (reset_stride)
 732     ret.strides = ret.dim.computeStrides();
 733
 734   TensorDim new_match_dim = dim_;
 735   new_match_dim.batch(dim.batch());
 736   if (new_match_dim != dim && !reset_stride)
 737     ret.contiguous = false;
 738
 739   /**
 740    * In this case, its the caller's responsibility to ensure that allocate() is
 741    * called for the output tensor before operating on the output tensor.
 742    */
 743   createSharedDataTensor(*this, ret, offset);
 744
 745   return ret;
 746 }
 747
 748 std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
 749
 750   NNTR_THROW_IF(num_size == 0, std::invalid_argument)
 751     << "num size cannot be zero";
 752
 753   if (axis == -1) {
 754     axis = 3;
 755   }
 756
 757   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
 758     << "cannot split axis of axis: " << axis;
 759
 760   NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument)
 761     << "axis is not divisible by num_size, axis: " << axis
 762     << " num size: " << num_size;
 763
 764   auto ret_dim = dim;
 765   auto new_dim = dim.getTensorDim(axis) / num_size;
 766   ret_dim.setTensorDim(axis, new_dim);
 767
 768   auto iter_value = [this, &ret_dim](std::array<unsigned, 4> &loc) -> float & {
 769     auto &value = getValue(loc[0], loc[1], loc[2], loc[3]);
 770     for (int i = 3; i >= 0; --i) {
 771       loc[i]++;
 772       if (loc[i] % ret_dim.getTensorDim(i) == 0) {
 773         loc[i] -= ret_dim.getTensorDim(i);
 774         continue;
 775       }
 776       break;
 777     }
 778     return value;
 779   };
 780
 781   std::vector<Tensor> ret;
 782   ret.reserve(num_size);
 783
 784   for (unsigned int i = 0; i < num_size; ++i) {
 785     std::array<unsigned, 4> loc = {0, 0, 0, 0};
 786     loc[axis] = new_dim * i;
 787     ret.emplace_back(ret_dim);
 788     auto &ret_t = ret.back();
 789
 790     ret_t.apply_i([&iter_value, &loc](float _) { return iter_value(loc); });
 791   }
 792
 793   return ret;
 794 }
 795
 796 Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
 797
 798   if (axis == -1) {
 799     axis = 3;
 800   }
 801
 802   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
 803     << "cannot split axis of axis: " << axis;
 804
 805   NNTR_THROW_IF(tensors.empty(), std::invalid_argument)
 806     << "given tensor vector is empty";
 807
 808   auto ref_dim = tensors.front().getDim();
 809   ref_dim.setTensorDim(axis, 1);
 810   NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
 811                              [&ref_dim, axis](const Tensor &t) {
 812                                auto cur_dim = t.getDim();
 813                                cur_dim.setTensorDim(axis, 1);
 814                                return ref_dim == cur_dim;
 815                              }),
 816                 std::invalid_argument)
 817     << " all tensor must have the same dimension except for the axis, ref_dim: "
 818     << ref_dim << " axis : " << axis;
 819
 820   auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
 821                                   [axis](unsigned cur, const Tensor &t) {
 822                                     return cur += t.getDim().getTensorDim(axis);
 823                                   });
 824   auto iter_value = [](std::array<unsigned, 4> &loc,
 825                        std::array<unsigned, 4> &start_loc, Tensor &t,
 826                        const TensorDim &ref_dim) -> float & {
 827     auto &value = t.getValue(loc[0], loc[1], loc[2], loc[3]);
 828     for (int i = 3; i >= 0; --i) {
 829       loc[i]++;
 830       if (loc[i] - start_loc[i] == ref_dim.getTensorDim(i)) {
 831         loc[i] = start_loc[i];
 832         continue;
 833       }
 834       break;
 835     }
 836     return value;
 837   };
 838
 839   auto ret_dim = ref_dim;
 840   ret_dim.setTensorDim(axis, axis_dim);
 841
 842   auto ret = Tensor(ret_dim);
 843
 844   std::array<unsigned, 4> loc = {0, 0, 0, 0};
 845   for (auto &t : tensors) {
 846     std::array<unsigned, 4> start_loc = loc;
 847     for (auto i = 0u, sz = t.size(); i < sz; ++i) {
 848       iter_value(loc, start_loc, ret, t.getDim()) = t.getValue(i);
 849     }
 850     loc[axis] += t.getDim().getTensorDim(axis);
 851   }
 852
 853   return ret;
 854 }
 855
 856 void Tensor::makeSharedDataTensor(const Tensor &src, unsigned int offset) {
 857   if (strides != src.strides)
 858     throw std::invalid_argument(
 859       "Creating shared tensor of different stride than source tensor.");
 860
 861   if (getDim().getDataLen() + offset > src.getDim().getDataLen())
 862     throw std::invalid_argument(
 863       "Creating shared tensor of different size or stride than source tensor.");
 864
 865   /**
 866    * In this case, its the caller's responsibility to ensure that allocate() is
 867    * called for the output tensor before operating on the output tensor.
 868    */
 869   createSharedDataTensor(src, *this, offset);
 870 }
 871
 872 void Tensor::apply_broadcast(
 873   Tensor const &m,
 874   std::function<void(const BroadcastInfo &e, const float *, const float *,
 875                      float *)>
 876     v_func,
 877   Tensor &output) const {
 878   CREATE_IF_EMPTY_DIMS(output, dim);
 879
 880   NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
 881     << getName() << " is not allocated";
 882   NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument)
 883     << m.getName() << " is not allocated";
 884   NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument)
 885     << output.getName() << " is not allocated";
 886
 887   /// shortcut to cover when dimension matches
 888   /// note that buffer_size, the last stride is only used in v_func but it
 889   /// might be changed
 890   if (dim == m.dim) {
 891     BroadcastInfo e;
 892     e.buffer_size = size();
 893     e.strides[3] = 1;
 894     v_func(e, getData(), m.getData(), output.getData());
 895     return;
 896   }
 897
 898   return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
 899 }
 900
 901 void Tensor::apply_broadcast_util(
 902   Tensor const &m,
 903   std::function<void(const BroadcastInfo &e, const float *, const float *,
 904                      float *)>
 905     v_func,
 906   Tensor &output, const BroadcastInfo &e, int cur_axis, unsigned int offset,
 907   unsigned int m_offset) const {
 908
 909   const float *buf = this->getData();
 910   const float *m_buf = m.getData();
 911   float *out_buf = output.getData();
 912
 913   if (e.buffer_axis == cur_axis) {
 914     v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
 915     return;
 916   }
 917
 918   cur_axis++;
 919   for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) {
 920     unsigned int next_offset = offset + i * strides[cur_axis];
 921     unsigned int next_m_offset = m_offset + i * e.strides[cur_axis];
 922     apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
 923                          next_m_offset);
 924   }
 925 }
 926
 927 /**
 928  * This is to sum the Tensor data according to the dim.batch().
 929  * Therefore the result has M(dim.batch(), 1, 1, 1) dimension.
 930  */
 931 Tensor Tensor::sum_by_batch() const {
 932   NNTR_THROW_IF(!contiguous, std::invalid_argument)
 933     << getName() << " is not contiguous, cannot sum";
 934
 935   Tensor ret(dim.batch(), 1, 1, 1);
 936   unsigned int feat_len = dim.getFeatureLen();
 937   unsigned int batch = dim.batch();
 938
 939   const float *data = getData();
 940   float *rdata = ret.getData();
 941
 942   Tensor ones(1, 1, 1, feat_len);
 943   ones.setValue(1.0);
 944   sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len,
 945         ones.getData(), 1, 0.0, rdata, 1);
 946
 947   return ret;
 948 }
 949
 950 /**
 951  * @brief Calculate sum according to the axis.
 952  */
 953 Tensor Tensor::sum(unsigned int axis, float alpha) const {
 954   Tensor ret;
 955   return sum(axis, ret, alpha, 0);
 956 }
 957 Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha,
 958                     float beta) const {
 959   const float *data = getData();
 960
 961   NNTR_THROW_IF(!contiguous, std::invalid_argument)
 962     << getName() << " is not contiguous, cannot sum";
 963
 964   if (axis >= 4)
 965     throw std::out_of_range("Error: axis is invalid");
 966
 967   if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
 968     CREATE_IF_EMPTY_DIMS(ret, dim);
 969     ret.copy(this->getData());
 970     return ret;
 971   }
 972
 973   switch (axis) {
 974   case 0: {
 975     CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width());
 976     unsigned int feat_len = dim.getFeatureLen();
 977     unsigned int batch = dim.batch();
 978     Tensor ones(1, 1, 1, batch);
 979     ones.setValue(alpha);
 980     sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len,
 981           ones.getData(), 1, beta, ret.getData(), 1);
 982   } break;
 983   case 1: {
 984     CREATE_IF_EMPTY_DIMS(ret, dim.batch(), 1, dim.height(), dim.width());
 985     unsigned int feat_len = dim.height() * dim.width();
 986     unsigned int channel = dim.channel();
 987     Tensor ones(1, 1, 1, channel);
 988     ones.setValue(alpha);
 989     float *rdata = ret.getData();
 990     for (unsigned int k = 0; k < dim.batch(); ++k) {
 991       sgemv(CblasRowMajor, CblasTrans, channel, feat_len, 1,
 992             &data[k * dim.getFeatureLen()], feat_len, ones.getData(), 1, beta,
 993             &rdata[k * feat_len], 1);
 994     }
 995   } break;
 996   case 2: {
 997     CREATE_IF_EMPTY_DIMS(ret, dim.batch(), dim.channel(), 1, dim.width());
 998     unsigned int width = dim.width();
 999     unsigned int height = dim.height();
1000     Tensor ones(1, 1, 1, height);
1001     ones.setValue(alpha);
1002     float *rdata = ret.getData();
1003     for (unsigned int k = 0; k < dim.batch(); ++k) {
1004       for (unsigned int c = 0; c < dim.channel(); ++c) {
1005         unsigned int idx =
1006           k * dim.getFeatureLen() + c * dim.width() * dim.height();
1007         unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim.width();
1008         sgemv(CblasRowMajor, CblasTrans, height, width, 1, &data[idx], width,
1009               ones.getData(), 1, beta, &rdata[ridx], 1);
1010       }
1011     }
1012   } break;
1013   case 3: {
1014     CREATE_IF_EMPTY_DIMS(ret, dim.batch(), dim.channel(), dim.height(), 1);
1015     unsigned int m = ret.dim.getDataLen();
1016     unsigned int n = dim.width();
1017     Tensor ones(1, 1, 1, n);
1018     ones.setValue(alpha);
1019     sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, ones.getData(), 1,
1020           beta, ret.getData(), 1);
1021   } break;
1022   default:
1023     throw std::out_of_range("Error: Dimension cannot exceed 3");
1024   }
1025   return ret;
1026 }
1027
1028 Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
1029   Tensor ret;
1030   return sum(axes, ret, alpha);
1031 }
1032
1033 void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
1034   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1035     << getName() << " is not contiguous, cannot merge axis";
1036
1037   if (axis2 != axis1 + 1)
1038     throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
1039
1040   dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2));
1041   dim.setTensorDim(axis1, 1);
1042 }
1043
1044 Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
1045                     float alpha) const {
1046   if (axes.empty())
1047     throw std::invalid_argument("empty axes given");
1048
1049   if (axes.size() == 1) {
1050     this->sum(axes[0], output, alpha);
1051   } else {
1052     /** club axes together */
1053     Tensor new_reshaped = *this;
1054     std::vector<unsigned int> new_axes = {axes[0]};
1055     for (unsigned int i = 1; i < axes.size(); ++i) {
1056       if (axes[i] == axes[i - 1] + 1) {
1057         new_reshaped.mergeAxis(axes[i - 1], axes[i]);
1058         new_axes.back() = axes[i];
1059       } else {
1060         new_axes.push_back(axes[i]);
1061       }
1062     }
1063
1064     Tensor ret = new_reshaped.sum(new_axes[0]);
1065     for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
1066       ret = ret.sum(axes[i]);
1067     ret.sum(new_axes.back(), output, alpha);
1068   }
1069
1070   return output;
1071 }
1072
1073 Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
1074                            bool trans_m, float beta) const {
1075   if (!result.isAllocated())
1076     throw std::invalid_argument(
1077       "Output tensor must be preallocated for dotBatched operation");
1078   for (unsigned int b = 0; b < batch(); b++) {
1079     /** @todo try using transpose to speedup the operation */
1080     const Tensor this_b = this->getBatchSlice(b, 1);
1081     Tensor m_b = m.getBatchSlice(b, 1);
1082     Tensor result_b = result.getBatchSlice(b, 1);
1083
1084     this_b.dot(m_b, result_b, trans, trans_m, beta);
1085   }
1086
1087   return result;
1088 }
1089
1090 Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const {
1091   Tensor output;
1092   dot(m, output, trans, trans_m);
1093
1094   return output;
1095 }
1096 /**
1097  * @brief compute the derivative of this in the current tensor
1098  * @todo will have to see if beta effects this computation
1099  */
1100 Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
1101                                 bool trans, bool trans_m, float beta) {
1102   bool deriv_trans_m = true;
1103   bool deriv_trans = false;
1104   /** @todo handle all cases of trans and trans_m */
1105   if (!trans && trans_m) {
1106     deriv_trans_m = false;
1107   }
1108
1109   return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
1110 }
1111
1112 /**
1113  * @brief compute the derivative wrt m in the m tensor
1114  * @note The caller tensor must be the same tensor as the one which called the
1115  * dot() product.
1116  */
1117 Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
1118                                 bool trans, bool trans_m, float beta) const {
1119   bool deriv_trans_m = false;
1120   bool deriv_trans = true;
1121   /** @todo handle all cases of trans and trans_m */
1122
1123   if (!trans && trans_m) {
1124     output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1125     return m_deriv;
1126   } else {
1127     return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
1128   }
1129 }
1130
1131 Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
1132                                         Tensor const &output_deriv, bool trans,
1133                                         bool trans_m, float beta) {
1134   bool deriv_trans_m = true;
1135   bool deriv_trans = false;
1136   /** @todo handle all cases of trans and trans_m */
1137   if (!trans && trans_m) {
1138     deriv_trans_m = false;
1139   }
1140
1141   return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
1142 }
1143
1144 Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
1145                                         Tensor const &output_deriv, bool trans,
1146                                         bool trans_m, float beta) const {
1147   bool deriv_trans_m = false;
1148   bool deriv_trans = true;
1149   /** @todo handle all cases of trans and trans_m */
1150
1151   if (!trans && trans_m) {
1152     output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
1153     return m_deriv;
1154   } else {
1155     return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
1156   }
1157 }
1158
1159 /**
1160  * @note: This dot product flattens the fist 3 axis for the purpose of
1161  * computation. So, while performing, these matrices are behaving as 2-D
1162  * matrices. The dimensions are restored while returning back the tensor
1163  * in case of trans is false.
1164  */
1165 Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m,
1166                     float beta) const {
1167   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1168     << getName() << " is not contiguous. Cannot dot product.";
1169
1170   // Comment out with intension to support the calculation wrt. batch and height
1171   // direction. It supposes to have this->dim as [ BxCxH,W ] and m.dim is
1172   // [BxCxH,W] as well if (m.dim.rank() > 2) {
1173   //   throw exception::not_supported("Error: support only for rank of dot "
1174   //                                  "matrix <= 2");
1175   // }
1176
1177   // Comment out with intension to support the calculation wrt. batch and height
1178   // direction of this tensor. It is OK as long as m is 2D
1179   //
1180   if (trans && dim.rank() > 2) {
1181     ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
1182   }
1183
1184   unsigned int dim1 = batch() * channel() * height();
1185   unsigned int dim2 = width();
1186   unsigned int mdim1 = m.batch() * m.channel() * m.height();
1187   unsigned int mdim2 = m.width();
1188
1189   unsigned int M, N, K, lda, ldb, ldc;
1190
1191   if (!trans && !trans_m) {
1192     if (dim2 != mdim1)
1193       throw std::runtime_error(
1194         "Error: incompatible dimensions for dot product");
1195     K = mdim1; /** == dim2 */
1196     N = mdim2;
1197     M = dim1;
1198     CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N);
1199
1200     // We are not set zero the result because of performnace reason.
1201     // However, result is not initialized properly. There might include
1202     // garbage like nan. When we have to use this value as in C = alpha*A*B +
1203     // beta*C, then have to check gargabe data of C is not effect or not.
1204
1205   } else if (!trans && trans_m) {
1206     if (dim2 != mdim2)
1207       throw std::runtime_error(
1208         "Error: incompatible dimensions for dot product");
1209     K = mdim2; /** == dim2 */
1210     N = mdim1;
1211     M = dim1;
1212     CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N);
1213   } else if (trans && !trans_m) {
1214     if (dim1 != mdim1)
1215       throw std::runtime_error(
1216         "Error: incompatible dimensions for dot product");
1217     K = mdim1; /** == dim1 */
1218     N = mdim2;
1219     M = dim2;
1220     CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N);
1221   } else {
1222     if (dim1 != mdim2)
1223       throw std::runtime_error(
1224         "Error: incompatible dimensions for dot product");
1225     K = mdim2; /** == dim1 */
1226     N = mdim1;
1227     M = dim2;
1228     CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N);
1229   }
1230   lda = dim2;
1231   ldb = mdim2;
1232   ldc = result.width();
1233
1234   const float *data = getData();
1235   const float *mdata = m.getData();
1236   float *rdata = result.getData();
1237   const float alpha = 1.0f;
1238   enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
1239   enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;
1240
1241   /// shortcut handling in case of vector
1242   /// for vector, (1 * K) == (K * 1) in current memory layout...
1243   /// and plaese note that N, K, M is a fixed place holder after considering
1244   /// transpose.
1245   /// For example, there is no case like (1 * K) X (1 * K) while
1246   /// (1 * K) X (1 * M) can be a case
1247   /// case1: (1 * K) X (K * 1)
1248   if (M == 1 && N == 1) {
1249     *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata);
1250   }
1251   /// case2: (M * K) X (K * 1)
1252   else if (N == 1) {
1253     sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta,
1254           rdata, 1);
1255   }
1256   /// case3: (1 * K) X (K * N) = 1 * N = R
1257   /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
1258   /// Effectively a translation of sgemv
1259   else if (M == 1) {
1260     transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
1261     sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, beta,
1262           rdata, 1);
1263   }
1264   /// case others: use gemm
1265   else {
1266     sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, ldb,
1267           beta, rdata, ldc);
1268   }
1269
1270   return result;
1271 }
1272
1273 Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
1274   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1275     << getName() << " is not contiguous. Cannot transpose.";
1276
1277   if (out.getData() == getData()) {
1278     Tensor tmp = clone();
1279     return tmp.transpose(direction, out);
1280   }
1281
1282   unsigned int SL, SI, SJ, SK;
1283   const float *inptr;
1284   float *outptr;
1285
1286   out.reshape(dim.transpose(direction));
1287
1288   int indexI = direction[0] - '0';
1289   int indexJ = direction[2] - '0';
1290
1291   SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
1292
1293   inptr = getData();
1294   outptr = out.getData();
1295
1296   switch (indexI) {
1297   case 0:
1298     if (indexJ == 1) {
1299       transposeloop(l, i, j, k, SL, SI, SJ, SK);
1300     } else {
1301       transposeloop(l, i, k, j, SL, SI, SK, SJ);
1302     }
1303     break;
1304   case 1:
1305     if (indexJ == 0) {
1306       transposeloop(l, j, i, k, SL, SJ, SI, SK);
1307     } else {
1308       transposeloop(l, j, k, i, SL, SJ, SK, SI);
1309     }
1310     break;
1311   case 2:
1312     if (indexJ == 0) {
1313       transposeloop(l, k, i, j, SL, SK, SI, SJ);
1314     } else {
1315       transposeloop(l, k, j, i, SL, SK, SJ, SI);
1316     }
1317     break;
1318   }
1319
1320   return out;
1321 }
1322
1323 Tensor Tensor::transpose(const std::string &direction) const {
1324   Tensor result(dim);
1325   transpose(direction, result);
1326   return result;
1327 }
1328
1329 Tensor Tensor::dropout_mask(float dropout) const {
1330   Tensor result(dim);
1331   result.dropout_mask(dropout);
1332   return result;
1333 }
1334
1335 void Tensor::dropout_mask(float dropout) {
1336   setRandUniform(0.0, 1.0);
1337   float scale = 1.0 / (1 - dropout);
1338   float *data_ = getData();
1339
1340   for (unsigned int i = 0; i < size(); ++i) {
1341     if (data_[i] >= dropout)
1342       data_[i] = scale;
1343     else
1344       data_[i] = 0.0;
1345   }
1346 }
1347
1348 void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
1349   float fill_mask_val = 0.0;
1350   float en_mask_val = 1.0 - fill_mask_val;
1351
1352   if (reverse) {
1353     fill_mask_val = 1.0;
1354     en_mask_val = 1.0 - fill_mask_val;
1355   }
1356
1357   setValue(fill_mask_val);
1358   if (mask_len.batch() != batch())
1359     throw std::invalid_argument("Number of filter masks mismatched");
1360
1361   for (unsigned int b = 0; b < batch(); b++) {
1362     float *addr = getAddress(b, 0, 0, 0);
1363     const uint *mask_len_val = mask_len.getAddress<uint>(b, 0, 0, 0);
1364     std::fill(addr, addr + (*mask_len_val), en_mask_val);
1365   }
1366 }
1367
1368 Tensor Tensor::zoneout_mask(float zoneout) {
1369   Tensor ret(getDim());
1370   zoneout_mask(ret, zoneout);
1371   return ret;
1372 }
1373
1374 void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
1375   if (dim != opposite.dim) {
1376     throw std::invalid_argument(
1377       "[Tensor::zoneout_mask] opposite dimension does not match");
1378   }
1379
1380   opposite.setRandBernoulli(zoneout);
1381   float *data = getData();
1382   float *opposite_data = opposite.getData();
1383
1384   for (unsigned int i = 0; i < size(); ++i) {
1385     if (opposite_data[i] > epsilon) {
1386       data[i] = 0.0f;
1387     } else {
1388       data[i] = 1.0f;
1389     }
1390   }
1391 }
1392
1393 int Tensor::apply_i(std::function<float(float)> f) {
1394   Tensor result = *this;
1395   apply(f, result);
1396
1397   return ML_ERROR_NONE;
1398 }
1399
1400 Tensor Tensor::apply(std::function<float(float)> f) const {
1401   Tensor result;
1402   return apply(f, result);
1403 }
1404
1405 Tensor &Tensor::apply(std::function<float(float)> f, Tensor &output) const {
1406   CREATE_IF_EMPTY_DIMS(output, dim);
1407
1408   if (dim != output.dim) {
1409     /// @todo add unittest
1410     throw std::invalid_argument(
1411       "[Tensor::apply] output dimension does not match");
1412   }
1413
1414   if (contiguous && output.contiguous) {
1415     const float *data = getData();
1416     float *rdata = output.getData();
1417     std::transform(data, data + size(), rdata, f);
1418   } else if (strides[3] == 1 && output.strides[3] == 1) {
1419     /** @todo optimize this with combining these loops where stride is 1 */
1420     for (unsigned int b = 0; b < batch(); ++b) {
1421       for (unsigned int c = 0; c < channel(); ++c) {
1422         for (unsigned int h = 0; h < height(); ++h) {
1423           float *out_data = output.getAddress(b, c, h, 0);
1424           const float *in_data = getAddress(b, c, h, 0);
1425           std::transform(in_data, in_data + width(), out_data, f);
1426         }
1427       }
1428     }
1429   } else {
1430     for (unsigned int b = 0; b < batch(); ++b) {
1431       for (unsigned int c = 0; c < channel(); ++c) {
1432         for (unsigned int h = 0; h < height(); ++h) {
1433           for (unsigned int w = 0; w < width(); ++w) {
1434             output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
1435           }
1436         }
1437       }
1438     }
1439   }
1440
1441   return output;
1442 }
1443
1444 Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
1445
1446 Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
1447                       Tensor &output) const {
1448   return f(*this, output);
1449 }
1450
1451 void Tensor::print(std::ostream &out) const {
1452   printInstance(out, this);
1453   const float *data = getData();
1454
1455   unsigned int len = size();
1456   out << "data addr: " << data << '\n';
1457   out << dim;
1458
1459   if (len > 100) {
1460     out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
1461         << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
1462         << std::endl;
1463     return;
1464   }
1465
1466   std::ios init(NULL);
1467   init.copyfmt(out);
1468   for (unsigned int k = 0; k < dim.batch(); k++) {
1469     for (unsigned int l = 0; l < dim.channel(); l++) {
1470       for (unsigned int i = 0; i < dim.height(); i++) {
1471         for (unsigned int j = 0; j < dim.width(); j++) {
1472           out << std::setw(10) << std::setprecision(10)
1473               << this->getValue(k, l, i, j) << " ";
1474         }
1475         out << std::endl;
1476       }
1477       out << std::endl;
1478     }
1479     out << "-------" << std::endl;
1480   }
1481   out.copyfmt(init);
1482 }
1483
1484 std::ostream &operator<<(std::ostream &out, Tensor const &m) {
1485   m.print(out);
1486   return out;
1487 }
1488
1489 void Tensor::copy(const float *buf) {
1490   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1491     << getName() << "Tensor is not contiguous, cannot copy.";
1492
1493   if (buf == getData()) {
1494     return;
1495   }
1496
1497   scopy(size(), buf, 1, getData(), 1);
1498 }
1499
1500 void Tensor::copy_with_stride(const Tensor &from) {
1501
1502   if (dim == from.getDim()) {
1503     for (unsigned int b = 0; b < batch(); ++b) {
1504       for (unsigned int c = 0; c < channel(); ++c) {
1505         for (unsigned int h = 0; h < height(); ++h) {
1506           for (unsigned int w = 0; w < width(); ++w) {
1507             setValue(b, c, h, w, from.getValue(b, c, h, w));
1508           }
1509         }
1510       }
1511     }
1512   } else {
1513     Tensor t = Tensor(from.getDim(), true);
1514     for (unsigned int b = 0; b < t.batch(); ++b) {
1515       for (unsigned int c = 0; c < t.channel(); ++c) {
1516         for (unsigned int h = 0; h < t.height(); ++h) {
1517           for (unsigned int w = 0; w < t.width(); ++w) {
1518             t.setValue(b, c, h, w, from.getValue(b, c, h, w));
1519           }
1520         }
1521       }
1522     }
1523     swap(t, *this);
1524   }
1525 }
1526
1527 void Tensor::copy(const Tensor &from) {
1528   // todo: enable copy to non-contiguous tensor
1529   if (!contiguous) {
1530     throw std::runtime_error("Cannot copy non-contiguous tensor");
1531   }
1532
1533   if (from.size() != 0 && size() == from.size()) {
1534     reshape(from.getDim());
1535     copy(from.getData());
1536   } else {
1537     Tensor t = Tensor(from.getDim(), from.getData());
1538     swap(t, *this);
1539   }
1540 }
1541
1542 void Tensor::copyData(const Tensor &from) {
1543   // todo: enable copy to non-contiguous tensor
1544   if (!contiguous) {
1545     throw std::runtime_error("Cannot copy non-contiguous tensor");
1546   }
1547
1548   if (size() != from.size())
1549     throw std::invalid_argument("Size of tensor to copy must match");
1550   copy(from.getData());
1551 }
1552
1553 Tensor Tensor::clone() const {
1554   Tensor t;
1555   t.copy(*this);
1556   t.name = name;
1557   return t;
1558 }
1559
1560 void Tensor::reshape(const TensorDim &d) {
1561
1562   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1563     << getName() << " is not contiguous, cannot reshape.";
1564
1565   NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument)
1566     << "[Tensor]: reshape cannot change the buffer size, trying reshaping "
1567        "\nfrom "
1568     << getDim() << " to " << d;
1569
1570   dim = d;
1571   strides = d.computeStrides();
1572 }
1573
1574 void Tensor::fill(const Tensor &from, bool alloc) {
1575   if (alloc && this->empty()) {
1576     this->copy(from);
1577     return;
1578   }
1579
1580   if (!from.contiguous || !contiguous) {
1581     /// @todo enable this if needed
1582     throw nntrainer::exception::not_supported(
1583       "[Tensor::fill] non-contiguous tensors are not supported");
1584   }
1585
1586   if (dim != from.getDim()) {
1587     throw std::invalid_argument("[Tensor::fill] dimension must be the same");
1588   }
1589
1590   if (strides != from.getStrides()) {
1591     /// @todo length does not represent buffer size, there should be way to get
1592     /// the buffer size
1593     throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
1594   }
1595
1596   this->copy(from.getData());
1597 }
1598
1599 void Tensor::save(std::ostream &file) {
1600   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1601     << getName() << " is not contiguous, cannot save.";
1602
1603   std::streamsize sz = static_cast<std::streamsize>(bytes());
1604   NNTR_THROW_IF(sz < 0, std::invalid_argument)
1605     << "save size: " << bytes()
1606     << " is too big. It cannot be represented by std::streamsize";
1607
1608   checkedWrite(file, (char *)getData(), sz, "[Tensor::save] operation failed");
1609   putData();
1610 }
1611
1612 void Tensor::read(std::ifstream &file) {
1613   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1614     << getName() << " is not contiguous, cannot read.";
1615
1616   std::streamsize sz = static_cast<std::streamsize>(bytes());
1617
1618   NNTR_THROW_IF(sz < 0, std::invalid_argument)
1619     << "read size: " << bytes()
1620     << " is too big. It cannot be represented by std::streamsize";
1621
1622   checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
1623   putData();
1624 }
1625
1626 /**
1627  * @brief Calculate average value according to the axis.
1628  */
1629 Tensor Tensor::average(unsigned int axis) const {
1630   Tensor t;
1631   return average(axis, t);
1632 }
1633
1634 /**
1635  * @brief Calculate average value according to the axis.
1636  */
1637 Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
1638   if (axis >= TensorDim::MAXDIM)
1639     throw std::out_of_range(
1640       "negative axis or axis more then MAXDIM is invalid");
1641
1642   unsigned int axis_size = dim.getDim()[axis];
1643   if (axis_size == 1)
1644     output.copy(*this);
1645   else
1646     this->sum(axis, output, 1.0 / ((float)axis_size));
1647
1648   return output;
1649 }
1650
1651 Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
1652   Tensor t;
1653   return average(axes, t);
1654 }
1655
1656 Tensor &Tensor::average(const std::vector<unsigned int> &axes,
1657                         Tensor &output) const {
1658   if (axes.empty())
1659     return this->average(output);
1660
1661   TensorDim ret_shape;
1662   for (const auto &idx : axes) {
1663     if (idx >= TensorDim::MAXDIM) {
1664       throw std::out_of_range("axis more then MAXDIM is invalid");
1665     }
1666     ret_shape.setTensorDim(idx, dim.getTensorDim(idx));
1667   }
1668
1669   return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
1670 }
1671
1672 /**
1673  * @brief Calculate average value according to the axis.
1674  */
1675 Tensor Tensor::average() const {
1676   Tensor result = *this;
1677   result.reshape({1, 1, 1, dim.getDataLen()});
1678   return result.average(3);
1679 }
1680
1681 /**
1682  * @brief Calculate average value according to the axis.
1683  */
1684 Tensor &Tensor::average(Tensor &output) const {
1685   Tensor result = *this;
1686   result.reshape({1, 1, 1, dim.getDataLen()});
1687   return result.average(3, output);
1688 }
1689
1690 void Tensor::setValue(float val) {
1691   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1692     << getName() << " is not contiguous, cannot set value.";
1693
1694   float *data = getData();
1695   std::fill(data, data + size(), val);
1696 }
1697
1698 void Tensor::setZero() {
1699   if (contiguous)
1700     sscal(size(), 0, getData(), 1);
1701   else
1702     apply_i([](float val) -> float { return 0; });
1703 }
1704
1705 std::vector<unsigned int> Tensor::argmax() const {
1706   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1707     << getName() << " is not contiguous, cannot get argmax.";
1708
1709   const float *data = getData();
1710   std::vector<unsigned int> result;
1711   unsigned int batch_size = batch();
1712   unsigned int feature_len = dim.getFeatureLen();
1713
1714   result.resize(batch_size);
1715
1716   for (unsigned int b = 0; b < batch_size; b++) {
1717     auto max_iter =
1718       std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
1719     result[b] = std::distance(data, max_iter) - (b * feature_len);
1720   }
1721
1722   return result;
1723 }
1724
1725 float Tensor::l2norm() const {
1726   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1727     << getName() << " is not contiguous, cannot get l2norm.";
1728
1729   unsigned int len = size();
1730   const float *data = getData();
1731
1732   return snrm2(len, data, 1);
1733 }
1734
1735 float Tensor::max_abs() const {
1736   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1737     << getName() << " is not contiguous, cannot get max_abs.";
1738
1739   unsigned int len = size();
1740   const float *data = getData();
1741
1742   unsigned int idx = isamax(len, data, 1);
1743   return *(data + idx);
1744 }
1745
1746 Tensor &Tensor::normalization(Tensor &output) const {
1747   if (output.empty())
1748     output = Tensor(dim);
1749
1750   output.copy(*this);
1751   output.normalization_i();
1752
1753   return output;
1754 }
1755
1756 void Tensor::normalization_i() {
1757   NNTR_THROW_IF(!contiguous, std::invalid_argument)
1758     << getName() << " is not contiguous, cannot do normalization.";
1759
1760   const float *data = getData();
1761
1762   auto bounds = std::minmax_element(data, data + size());
1763   const float min = *bounds.first;
1764   const float max = *bounds.second;
1765
1766   if (max == min) {
1767     Tensor tmp = *this;
1768     this->subtract_i(tmp);
1769   } else {
1770     this->subtract_i(min);
1771     this->divide_i(max - min);
1772   }
1773 }
1774
1775 LazyTensor Tensor::chain() const { return LazyTensor(*this); }
1776
1777 Tensor &Tensor::standardization(Tensor &output) const {
1778   if (output.empty())
1779     output = Tensor(dim);
1780
1781   output.copy(*this);
1782   output.standardization_i();
1783
1784   return output;
1785 }
1786
1787 void Tensor::standardization_i() {
1788   Tensor mean_by_batch = this->sum_by_batch();
1789   mean_by_batch.divide_i(dim.getFeatureLen());
1790
1791   this->subtract_i(mean_by_batch);
1792
1793   Tensor std_dev_by_batch(dim.batch(), 1, 1, 1);
1794   std_dev_by_batch.setZero();
1795   float *std_dev = std_dev_by_batch.getData();
1796
1797   for (unsigned int k = 0; k < dim.batch(); ++k) {
1798     Tensor sub_this = this->getBatchSlice(k, 1);
1799     std_dev[k] = sub_this.l2norm();
1800   }
1801
1802   std_dev_by_batch.divide_i(dim.getFeatureLen());
1803   this->divide_i(std_dev_by_batch);
1804 }
1805
1806 Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const {
1807   if (m.size() > this->size())
1808     throw exception::not_supported("broadcasting *this is not supported");
1809
1810   const TensorDim m_dim = m.getDim();
1811
1812   BroadcastInfo e;
1813
1814   /// checking if given Tensor's can be broadcasted
1815   for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) {
1816     if (dim.getTensorDim(i) == m_dim.getTensorDim(i)) {
1817       e.strides[i] = m.strides[i];
1818       continue;
1819     }
1820
1821     /// If given dimension is 1, it could be reused, the stride remaining 0
1822     /// Need to check if dim[i] == 1 && m_dim[i] == 1 first though
1823     /// If so, strides should not change
1824     if (m_dim.getTensorDim(i) == 1) {
1825       continue;
1826     }
1827
1828     std::stringstream ss;
1829     ss << "[computeBroadcastInfo] broadcasting only allowed for "
1830           "dimension value of 1 \n"
1831        << "this: " << dim << "target: " << m_dim;
1832     throw std::invalid_argument(ss.str().c_str());
1833   }
1834
1835   /// calculate inner loop size
1836   e.buffer_size = 1;
1837   e.buffer_axis = -1;
1838   e.strides[3] = m.strides[3];
1839
1840   /// initiate buffer info with matching dimension strategy
1841   for (int axis = 3; axis >= 0; --axis) {
1842     if (dim.getTensorDim(axis) != m_dim.getTensorDim(axis)) {
1843       e.buffer_axis = axis;
1844       break;
1845     }
1846
1847     e.buffer_size *= dim.getTensorDim(axis);
1848   }
1849
1850   /// check strategy that uses consecutive ones
1851   if (m_dim.getTensorDim(3) == 1) {
1852     unsigned int inner_loop_size = 1;
1853     int axis;
1854     for (axis = 3; axis >= 0; --axis) {
1855       if (m_dim.getTensorDim(axis) != 1) {
1856         break;
1857       }
1858
1859       inner_loop_size *= dim.getTensorDim(axis);
1860     }
1861
1862     /// if consecutive-one strategy has bigger chunk size, replace the
1863     /// information
1864     if (inner_loop_size > e.buffer_size) {
1865       e.buffer_axis = axis;
1866       e.buffer_size = inner_loop_size;
1867       e.strides[3] = 0;
1868     }
1869   }
1870
1871   return e;
1872 }
1873
1874 } /* namespace nntrainer */