nntrainer/graph/network_graph.cpp

   1 // SPDX-License-Identifier: Apache-2.0
   2 /**
   3  * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
   4  *
    5  * @file    network_graph.cpp
   6  * @date    19 Oct 2020
   7  * @see     https://github.com/nnstreamer/nntrainer
   8  * @author  Jijoong Moon <jijoong.moon@samsung.com>
   9  * @bug     No known bugs except for NYI items
  10  * @brief   This is Network Graph Class for Neural Network
  11  *
  12  * @todo    Support multi-input graph.
  13  */
  14
  15 #include "tensor.h"
  16 #include <cmath>
  17 #include <stdexcept>
  18 #include <string>
  19
  20 #include <activation_layer.h>
  21 #include <addition_layer.h>
  22 #include <bn_layer.h>
  23 #include <concat_layer.h>
  24 #include <connection.h>
  25 #include <cross_entropy_loss_layer.h>
  26 #include <cross_entropy_sigmoid_loss_layer.h>
  27 #include <cross_entropy_softmax_loss_layer.h>
  28 #include <flatten_layer.h>
  29 #include <grucell.h>
  30 #include <identity_layer.h>
  31 #include <input_layer.h>
  32 #include <layer_node.h>
  33 #include <layer_normalization_layer.h>
  34 #include <lstmcell.h>
  35 #include <multiout_layer.h>
  36 #include <network_graph.h>
  37 #include <nntrainer_error.h>
  38 #include <nntrainer_log.h>
  39 #include <profiler.h>
  40 #include <rnn.h>
  41 #include <rnncell.h>
  42 #include <split_layer.h>
  43 #include <time_dist.h>
  44 #include <tracer.h>
  45 #include <util_func.h>
  46
/**
 * @brief Cast a shared pointer @a x to std::shared_ptr<LayerNode>.
 * @note This is a static (unchecked) pointer cast.
 */
#define LNODE(x) std::static_pointer_cast<LayerNode>(x)
  48
  49 namespace nntrainer {
  50
  51 int NetworkGraph::compile(const std::string &loss_type) {
  52   int status = ML_ERROR_NONE;
  53
  54   status = isCompilable();
  55   NN_RETURN_STATUS();
  56
  57   try {
  58     setOutputConnections();
  59   } catch (std::exception &e) {
  60     ml_loge("setting output layer failed, reason: %s", e.what());
  61     return ML_ERROR_INVALID_PARAMETER;
  62   }
  63
  64   graph.realizeInputOutputNode();
  65
  66   try {
  67     /// @todo realize loss beforehand
  68     status = addLossLayer(loss_type);
  69     NN_RETURN_STATUS();
  70   } catch (const std::exception &e) {
  71     ml_loge("%s", e.what());
  72     status = ML_ERROR_INVALID_PARAMETER;
  73     NN_RETURN_STATUS();
  74   }
  75
  76   graph.topologicalSort();
  77
  78   setExecutionOrder();
  79   forward_iter_end = (*(cend() - 1)).get();
  80
  81   inPlaceOptimize();
  82
  83   status = checkCompiledGraph();
  84   NN_RETURN_STATUS();
  85
  86   compiled = true;
  87
  88   return status;
  89 }
  90
  91 void NetworkGraph::setExecutionOrder() {
  92   auto backward_order = graph.size();
  93   for (auto iter = getBackwardingBeginIter(); iter != getBackwardingEndIter();
  94        iter++) {
  95     auto &node = *iter;
  96     auto order_idx = getBackwardingEndIter() - iter - 1;
  97     auto forward_order = order_idx;
  98     auto calc_gradient_order = backward_order;
  99     if (node->getTrainable())
 100       backward_order++;
 101     auto calc_derivative_order = backward_order;
 102     if (node->getTrainable())
 103       backward_order++;
 104     auto apply_gradient_order = backward_order++;
 105
 106     node->setExecutionOrder({forward_order, calc_gradient_order,
 107                              calc_derivative_order, apply_gradient_order});
 108   }
 109
 110   /**
 111    * This sets max execution order temporarily till model is initialized.
 112    * This set max execution order is used to extend gradient exec orders for
 113    * clipping.
 114    */
 115   graph_exec_end = std::get<3>((*(cbegin()))->getExecutionOrder());
 116 }
 117
 118 void NetworkGraph::addLayerNode(std::unique_ptr<Layer> layer) {
 119   graph.addNode(std::make_unique<LayerNode>(std::move(layer)));
 120 }
 121
 122 int NetworkGraph::addLossLayer(const std::string &loss_type_) {
 123   for (unsigned int i = 0; i < graph.getNumOutputNodes(); ++i) {
 124     auto output_layer_node = LNODE(graph.getOutputNode(i));
 125     std::string loss_type = loss_type_;
 126
 127     if (output_layer_node->requireLabel())
 128       continue;
 129
 130     if (loss_type.empty())
 131       continue;
 132
 133     auto second_to_last_layer_node = output_layer_node;
 134     bool is_cross_entropy_loss =
 135       istrequal(loss_type, CrossEntropyLossLayer::type);
 136     if (is_cross_entropy_loss) {
 137       auto type = output_layer_node->getType();
 138
 139       if (type != ActivationLayer::type) {
 140         throw exception::not_supported(
 141           "Error: Cross Entropy need last layer to have softmax or sigmoid"
 142           "activation.");
 143       }
 144
 145       switch (output_layer_node->getActivationType()) {
 146       case ActivationType::ACT_SIGMOID:
 147         loss_type = CrossEntropySigmoidLossLayer::type;
 148         break;
 149       case ActivationType::ACT_SOFTMAX:
 150         loss_type = CrossEntropySoftmaxLossLayer::type;
 151         break;
 152       default:
 153         throw exception::not_supported(
 154           "Error: Cross Entropy not supported without softmax or sigmoid.");
 155       }
 156
 157       second_to_last_layer_node =
 158         LNODE(graph.getNode(output_layer_node->getInputConnectionName(0)));
 159     }
 160
 161     std::shared_ptr<LayerNode> lnode = createLayerNode(loss_type);
 162     graph.ensureName(*lnode);
 163
 164     if (second_to_last_layer_node->getDistribute()) {
 165       lnode->setProperty({"distribute=true"});
 166     }
 167
 168     /// @todo remove this by add loss at realization
 169     second_to_last_layer_node->setOutputLayers({lnode->getName()});
 170     lnode->setProperty(
 171       {"input_layers=" + second_to_last_layer_node->getName()});
 172
 173     if (is_cross_entropy_loss) {
 174       graph.replaceNode(output_layer_node, lnode);
 175     } else {
 176       graph.addNode(lnode, false);
 177     }
 178     graph.replaceOutputNode(i, lnode);
 179   }
 180
 181   return ML_ERROR_NONE;
 182 }
 183
 184 void NetworkGraph::setOutputConnections() {
 185   for (auto layer_iter = cbegin(); layer_iter != cend(); layer_iter++) {
 186     const auto &node = *layer_iter;
 187     for (auto i = 0u, num_inode = node->getNumInputConnections(); i < num_inode;
 188          ++i) {
 189       const auto &name = node->getInputConnectionName(i);
 190       const auto &idx = node->getInputConnectionIndex(i);
 191
 192       auto node_setting_output = getLayerNode(name);
 193       node_setting_output->setOutputConnection(idx, node->getName(), i);
 194     }
 195   }
 196 }
 197
 198 int NetworkGraph::isCompilable() {
 199   if (compiled) {
 200     ml_loge("Graph is already compiled");
 201     return ML_ERROR_NOT_SUPPORTED;
 202   }
 203
 204   if (graph.empty()) {
 205     ml_loge("Graph is empty");
 206     return ML_ERROR_INVALID_PARAMETER;
 207   }
 208
 209   return ML_ERROR_NONE;
 210 }
 211
 212 int NetworkGraph::checkCompiledGraph() {
 213   /** Dimension of input layers must be known */
 214   for (auto iter = cbegin(); iter != cend(); iter++) {
 215     auto lnode = (*iter);
 216     if (lnode->getNumInputConnections() == 0) {
 217       if (!lnode->hasInputShapeProperty()) {
 218         ml_loge("Layer with no inbound connection need input_shape property");
 219         return ML_ERROR_INVALID_PARAMETER;
 220       }
 221     }
 222   }
 223
 224   return ML_ERROR_NONE;
 225 }
 226
 227 void NetworkGraph::markNodesForBackwarding() {
 228   /** accumulate all the nodes which must support backwarding */
 229   std::unordered_set<std::string> must_support_backwarding;
 230
 231   /**
 232    * if a node is trainable, then all the nodes ahead of it must support
 233    * backwarding operation
 234    */
 235   for (auto iter = cbegin(); iter != cend(); iter++) {
 236     auto lnode = (*iter);
 237     if (lnode->getTrainable() ||
 238         must_support_backwarding.find(lnode->getName()) !=
 239           must_support_backwarding.end()) {
 240       if (lnode->getTrainable()) {
 241         lnode->needsCalcGradient(true);
 242       }
 243 #ifdef ENABLE_TEST
 244       if (lnode->supportBackwarding() && !optimize_memory) {
 245         lnode->needsCalcDerivative(true);
 246       }
 247 #endif
 248
 249       for (auto i = 0u, num_node = lnode->getNumOutputConnections();
 250            i < num_node; ++i) {
 251         auto conn = lnode->getOutputConnection(i);
 252         if (!conn) {
 253           continue;
 254         }
 255
 256         must_support_backwarding.insert(conn->getName());
 257       }
 258     }
 259   }
 260
 261   /** mark all the required nodes support backwarding */
 262   for (auto const &node_name : must_support_backwarding) {
 263     auto ln = LNODE(graph.getNode(node_name)).get();
 264     ln->needsCalcDerivative(true);
 265   }
 266 }
 267
 268 void NetworkGraph::setBatchSize(unsigned int batch_size) {
 269   if (batch_size == this->batch_size)
 270     return;
 271
 272   this->batch_size = batch_size;
 273   if (!input_list.empty() && getInputDimension()[0].batch() == batch_size)
 274     return;
 275
 276   auto allocated = tensor_manager->isAllocated();
 277
 278   if (allocated)
 279     deallocateTensors();
 280
 281   for (auto iter = cbegin(); iter != cend(); iter++) {
 282     if ((*iter)->isFinalized()) {
 283       /// resize tensors spec
 284       /// @todo remove below, if cutsom tensor needs to change dimension
 285       /// according to the tensor, it must be done explicitly, or at least have
 286       /// a property to control the behavior
 287       const RunLayerContext &context = (*iter)->getRunContext();
 288       for (unsigned int idx = 0; idx < context.getNumTensors(); idx++) {
 289         auto const &ts = context.getTensor(idx);
 290         tensor_manager->setBatchSize(ts.getName(), ts.getDim().batch());
 291         if (context.tensorHasGradient(idx)) {
 292           auto const &ts_grad = context.getTensorGrad(idx);
 293           tensor_manager->setBatchSize(ts_grad.getName(),
 294                                        ts_grad.getDim().batch());
 295         }
 296       }
 297       /// override setting batch as per request
 298       (*iter)->setBatch(batch_size);
 299     }
 300   }
 301   /// resize input and output spec
 302   tensor_manager->setBatchSize(batch_size);
 303
 304   if (allocated)
 305     allocateTensors(exec_mode);
 306
 307   /** update input and label dimensions */
 308   for (unsigned int idx = 0; idx < input_list.size(); idx++)
 309     input_dims[idx] = tensor_manager->getTensor(input_list[idx])->getDim();
 310   for (unsigned int idx = 0; idx < label_list.size(); idx++)
 311     label_dims[idx] = tensor_manager->getTensor(label_list[idx])->getDim();
 312 }
 313
 314 void NetworkGraph::applyGradients(
 315   LayerNode *node, const std::function<void(Weight &)> &apply_func) {
 316
 317   if (!node->getTrainable())
 318     return;
 319
 320   TRACE_MEMORY() << node->getName() + ": AG";
 321   TRACE_TIME() << node->getName() + ": AG";
 322
 323   auto &rc = node->getRunContext();
 324   auto num_weight = rc.getNumWeights();
 325   for (unsigned i = 0; i < num_weight; ++i) {
 326     if (!rc.weightHasGradient(i)) {
 327       continue;
 328     }
 329
 330     if (!rc.isGradientLastAccess(i)) {
 331       /// @note instead of checking the last access of the weight, checking
 332       /// if weights are dependent to others to minimize overhead.
 333       /// this logic assums that the source of the dependent weight must be
 334       /// prior to the dependent.
 335       continue;
 336     }
 337
 338     if (rc.isGradientClipByGlobalNorm(i)) {
 339       /**
 340        * @note the weights whose gradient are to be clipped by global norm will
 341        * be clipped at once at the end of iteration and applied then.
 342        */
 343       continue;
 344     }
 345
 346     apply_func(rc.getWeightObject(i));
 347   }
 348 }
 349
 350 sharedConstTensors NetworkGraph::forwarding(
 351   bool training,
 352   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op,
 353   std::function<bool(void *userdata)> stop_cb, void *userdata) {
 354   for (auto iter = cbegin(); iter != cend() && !stop_cb(userdata); iter++) {
 355     auto &ln = *iter;
 356     PROFILE_TIME_START(profile_keys.at(ln->getType()));
 357     forwarding_op(*iter, training);
 358     PROFILE_TIME_END(profile_keys.at(ln->getType()));
 359   }
 360
 361   sharedConstTensors out;
 362   for (unsigned int i = 0; i < graph.getNumOutputNodes(); ++i) {
 363     auto const &output_layer_node = LNODE(graph.getOutputNode(i));
 364     for (unsigned int j = 0; j < output_layer_node->getNumOutputs(); ++j) {
 365       out.push_back(MAKE_SHARED_TENSOR(output_layer_node->getOutput(j)));
 366     }
 367   }
 368
 369   return out;
 370 }
 371
 372 void NetworkGraph::backwarding(
 373   int iteration,
 374   std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
 375   std::function<void(Weight &, int)> &apply_grad_clip_op,
 376   std::function<bool(void *userdata)> stop_cb, void *userdata) const {
 377   /**
 378    * last layer backwarding is run out of this loop
 379    */
 380   auto iter_begin = getBackwardingBeginIter();
 381   auto iter_end = getBackwardingEndIter();
 382
 383   /// there is no layer to train, so backwarding is essentially noop
 384   if (iter_begin == iter_end) {
 385     return;
 386   }
 387
 388   auto const &lptr_begin = (*iter_begin);
 389
 390   if (lptr_begin->requireLabel() == false)
 391     throw std::runtime_error(
 392       "Error: last layer does not accept label, we can't train");
 393
 394   for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
 395     auto &ln = *iter;
 396     PROFILE_TIME_START(profile_keys.at(ln->getType()));
 397     backwarding_op(ln, iteration);
 398     PROFILE_TIME_END(profile_keys.at(ln->getType()));
 399   }
 400
 401   /** perform clipping of the gradients by global norm if any */
 402   if (clip_weights.empty())
 403     return;
 404
 405   /** calculate the global norm */
 406   Tensor global_norm_t(
 407     TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
 408   float *global_norm_data = global_norm_t.getData();
 409   for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
 410     auto const &w = clip_weights[idx];
 411     global_norm_data[idx] = w->getGradientNorm();
 412   }
 413   float global_norm = global_norm_t.l2norm();
 414   /** apply the gradient with the above global norm */
 415   for (auto w : clip_weights) {
 416     w->clipGradientByGlobalNorm(global_norm);
 417   }
 418   /** apply the gradient with the above global norm */
 419   for (auto w : clip_weights) {
 420     apply_grad_clip_op(*w, iteration);
 421   }
 422 }
 423
 424 LayerNode *NetworkGraph::computeBackwardEnd() {
 425   int max_exec_order = -1;
 426   LayerNode *node = nullptr;
 427
 428   if (!optimize_memory) {
 429     return (*cbegin()).get();
 430   }
 431
 432   for (auto iter = getBackwardingBeginIter(); iter != getBackwardingEndIter();
 433        iter++) {
 434     auto &ln = *iter;
 435     const auto &exec_order = ln->getExecutionOrder();
 436     int cur_order = std::get<0>(exec_order);
 437     if (ln->needsCalcDerivative() || ln->needsCalcGradient()) {
 438 #ifdef ENABLE_TEST
 439       cur_order = std::get<2>(exec_order);
 440 #else
 441       cur_order = std::get<1>(exec_order);
 442 #endif
 443     }
 444
 445     NNTR_THROW_IF(max_exec_order == cur_order, std::invalid_argument)
 446       << "layer node: " << ln->getName()
 447       << " has duplicated max_exec_order, this should not happen, current "
 448          "execution order: "
 449       << max_exec_order;
 450
 451     if (max_exec_order < cur_order) {
 452       max_exec_order = cur_order;
 453       node = ln.get();
 454     }
 455   }
 456
 457   return node;
 458 }
 459
 460 /**
 461  * @brief Allocate memory for all the managed tensors
 462  */
 463 void NetworkGraph::allocateTensors(ExecutionMode exec_mode_) {
 464   exec_mode = exec_mode_;
 465   if (exec_mode == ExecutionMode::INFERENCE)
 466     /**
 467      * get the order of execution/usage order for the forwarding of the last
 468      * layer and pass that as the max_exec_order ensuring that all tensors
 469      * with usage less than the max_exec_order are allocated.
 470      */
 471     tensor_manager->allocateTensors(
 472       std::get<0>((*(cend() - 1))->getExecutionOrder()));
 473   else {
 474     /**
 475      * get the order of execution/usage order for the backwarding of the first
 476      * layer (as that will be the last layer to executed in the backwarding)
 477      * and pass that as the max_exec_order ensuring that all tensors with
 478      * usage less than the max_exec_order are allocated.
 479      */
 480     tensor_manager->allocateTensors(
 481       std::get<3>(backward_iter_end->getExecutionOrder()));
 482   }
 483 }
 484
/**
 * @brief Get the input dimensions cached in input_dims
 *
 * @return copy of input_dims
 * @note raises std::invalid_argument through NNTR_THROW_IF when input_dims is
 * empty
 */
std::vector<TensorDim> NetworkGraph::getInputDimension() const {
  NNTR_THROW_IF(input_dims.empty(), std::invalid_argument)
    << "[NetworkGraph] the graph has no node identified as input!";
  return input_dims;
}
 490
 491 unsigned int NetworkGraph::getBatchSize() const { return batch_size; }
 492
/**
 * @brief Get the output dimensions of the graph
 *
 * @return copy of label_dims, which currently doubles as the output dimension
 * @note raises std::invalid_argument through NNTR_THROW_IF when label_dims is
 * empty
 */
std::vector<TensorDim> NetworkGraph::getOutputDimension() const {
  NNTR_THROW_IF(label_dims.empty(), std::invalid_argument)
    << "[NetworkGraph] the graph has no node identified as output!";
  /// for now, outputting label_dims works, later label dim will be different
  /// from output dimension
  return label_dims;
}
 500
 501 std::vector<std::shared_ptr<LayerNode>>
 502 NetworkGraph::getUnsortedLayers(const std::string &input_layer,
 503                                 const std::string &output_layer) const {
 504   /// @fixme: this won't work if input, output layers are not in order
 505   /// Further, this function must be removed. There should be rather
 506   /// getAllNames and getLayerByName instead of getUnsortedLayers.
 507
 508   /** count layers after output layer */
 509   unsigned int num_layers_remove_end = 0;
 510   if (!output_layer.empty()) {
 511     for (auto iter = graph.crbegin(); iter != graph.crend(); iter++) {
 512       if ((*iter)->getName() != output_layer)
 513         num_layers_remove_end++;
 514       else
 515         break;
 516     }
 517   }
 518
 519   if (num_layers_remove_end == graph.size())
 520     return {};
 521
 522   /** count layers before input layer */
 523   unsigned int num_layers_remove_start = 0;
 524   if (!input_layer.empty()) {
 525     for (auto iter = graph.cbegin();
 526          iter != graph.cend() - num_layers_remove_end; iter++) {
 527       if ((*iter)->getName() != input_layer)
 528         num_layers_remove_start++;
 529       else
 530         break;
 531     }
 532   }
 533
 534   /** copy the graph and return */
 535   std::vector<std::shared_ptr<LayerNode>> ret;
 536   std::transform(graph.cbegin() + num_layers_remove_start,
 537                  graph.cend() - num_layers_remove_end, std::back_inserter(ret),
 538                  [](auto const &elem) { return LNODE(elem); });
 539
 540   return ret;
 541 }
 542
 543 std::vector<std::shared_ptr<LayerNode>> NetworkGraph::getLayerNodes() const {
 544   return std::vector<std::shared_ptr<LayerNode>>(cbegin(), cend());
 545 }
 546
 547 void NetworkGraph::addLayer(std::shared_ptr<LayerNode> layer) {
 548   if (compiled)
 549     throw std::runtime_error("Cannot modify graph after compile");
 550
 551   /** Insert the layer to the graph */
 552   graph.addNode(layer);
 553 }
 554
 555 InPlace
 556 NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
 557   if (!lnode->supportInPlace())
 558     return InPlace::NONE;
 559
 560   /** layers which behave as a no-op - flatten */
 561   auto no_op = [](const std::shared_ptr<LayerNode> &lnode) {
 562     return lnode->getType() == FlattenLayer::type ||
 563            lnode->getType() == IdentityLayer::type;
 564   };
 565
 566   /** layers which behave as a no-op but shares memory among parallel nodes -
 567    * multiout */
 568   auto no_op_shared = [](const std::shared_ptr<LayerNode> &lnode) {
 569     return lnode->getType() == MultiOutLayer::type;
 570   };
 571
 572   /**
 573    * layers whose backwarding is not dependent on input/output but only its
 574    * derivatives and weights, if any - batch normalization
 575    */
 576   auto io_independent_backwarding =
 577     [](const std::shared_ptr<LayerNode> &lnode) {
 578       return (lnode->getType() == BatchNormalizationLayer::type) ||
 579              (lnode->getType() == LayerNormalizationLayer::type);
 580     };
 581
 582   /**
 583    * @note Conditions to decide if this layer node can be in-place:
 584    * 1. if the layer is a no-op, then it can operate in-place as it is not
 585    * modifying its input/output tensors and does not need to check its
 586    * neighboring nodes for dependency.
 587    * 2. if the layer is not supporting backwarding, there is no dependency
 588    * requirement with other nodes for backwarding.
 589    *
 590    * @note Conditions to decide the type of inplace for this layer:
 591    * 1. if the previous layers were restricting, then this layer will also be
 592    * restricting.
 593    * 2. if the previous layer were non_restricting or not inplace, then this
 594    * layer will be non-restricting.
 595    */
 596   if (no_op(lnode) || !lnode->supportBackwarding()) {
 597     for (auto i = 0u, num_node = lnode->getNumInputConnections(); i < num_node;
 598          ++i) {
 599       const auto &input_name = lnode->getInputConnectionName(i);
 600       if (getLayerNode(input_name)->executeInPlace() == InPlace::RESTRICTING)
 601         return InPlace::RESTRICTING;
 602     }
 603     return InPlace::NON_RESTRICTING;
 604   }
 605
 606   /**
 607    * @note Conditions to decide if this layer node can be in-place:
 608    * if the layer is a no-op-shared, then it can operate in-place as it is not
 609    * modifying its input/output tensors and does not need to check its
 610    * neighboring nodes for dependency.
 611    *
 612    * @note Conditions to decide the type of inplace for this layer:
 613    * As all the output nodes are sharing memory, the output nodes cant execute
 614    * inplace, and then its restricting mode.
 615    */
 616   if (no_op_shared(lnode))
 617     return InPlace::RESTRICTING;
 618
 619   /**
 620    * @note Conditions to decide if this layer node can be in-place:
 621    * This is a generic case where the layer can support in-place but will
 622    * modify its input in-place. This includes layers like activation, etc.
 623    * Apply checks below to ensure that the layers can work in-place:
 624    * - if any of the input layer are restriction, then this layer cannot work
 625    *   as layers behind this layer have added restrictions.
 626    * - if all of the input layers are either not inplace or have no
 627    * restrictions, then this layer can operate in-place.
 628    *
 629    * @note Conditions to decide the type of inplace for this layer:
 630    * This is a generic case, and always restrictions on the next nodes to be
 631    * not inplace.
 632    *
 633    * @note This logic is prone to change as more layers are allowed to
 634    * work in-place such as concat layer, split layer, addition layer, dropout
 635    * layer, etc.
 636    *
 637    * @todo This logic sets layers to in-place one-by-one as they arrive. However
 638    * setting some layers to in-place can save more memory than others (like
 639    * multiout layer vs activation layer). The layers need to sorted based on the
 640    * memory save they provide and then make them in-place in that order.
 641    */
 642   if (lnode->getType() == ActivationLayer::type ||
 643       lnode->getType() == BatchNormalizationLayer::type ||
 644       lnode->getType() == LayerNormalizationLayer::type) {
 645     for (auto i = 0u, num_node = lnode->getNumInputConnections(); i < num_node;
 646          ++i) {
 647       if (getLayerNode(lnode->getInputConnectionName(i))->executeInPlace() ==
 648           InPlace::RESTRICTING)
 649         return InPlace::NONE;
 650     }
 651
 652     /**
 653      * if the layer does io_independent_backwarding where the input and output
 654      * is not required during backwarding, then it is a non-restricting in-place
 655      * layer.
 656      */
 657     if (io_independent_backwarding(lnode))
 658       return InPlace::NON_RESTRICTING;
 659
 660     return InPlace::RESTRICTING;
 661   }
 662
 663   return InPlace::NONE;
 664 }
 665
 666 void NetworkGraph::inPlaceOptimize() {
 667   if (optimize_memory) {
 668     for (unsigned int idx = 0; idx < graph.size(); ++idx) {
 669       auto const &lnode = getSortedLayerNode(idx);
 670       lnode->executeInPlace(canExecuteInPlace(lnode));
 671     }
 672   }
 673 }
 674
 675 /**
 676  * @brief Set the Inplace Shared Memory Config By Layer object
 677  *
 678  * @param lnode layer node object
 679  * @param shared_var if the variable should be shared
 680  * @param shared_grad if the gradient should be shared
 681  */
 682 static void
 683 setInplaceSharedMemoryConfigByLayer(const std::shared_ptr<LayerNode> &lnode,
 684                                     bool &shared_var, bool &shared_grad) {
 685   /** for multiout layer, variables are shared but gradients are not */
 686   if (lnode->getType() == MultiOutLayer::type) {
 687     shared_var = true;
 688     shared_grad = false;
 689   } else {
 690     shared_var = true;
 691     shared_grad = true;
 692   }
 693   /** @todo for addition layer, variables are not shared but gradients are */
 694   /**
 695    * @todo for layers which support in-place, both variables and gradients
 696    * will be shared.
 697    *
 698    * @todo add a check here is the layer being checked here can support
 699    * in-place or not
 700    */
 701 }
 702
/**
 * @brief Finalize a layer node and configure its run context
 *
 * The node is finalized with the dimensions of @a prev_inputs. Its inputs,
 * outputs, weights and tensors are then requested from the tensor manager and
 * handed to the node through configureRunContext().
 *
 * @param lnode layer node to finalize
 * @param prev_inputs var_grads feeding this node; their dimensions and names
 * are used to finalize the node and to request its inputs
 * @return outputs requested from the tensor manager for this node
 */
std::vector<Var_Grad *>
NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
                              const std::vector<Var_Grad *> &prev_inputs) {
  const GraphNode &gnode = *lnode.get();
  /** dimensions of the given inputs, in the same order as prev_inputs */
  std::vector<TensorDim> input_dims;
  input_dims.reserve(prev_inputs.size());
  std::transform(prev_inputs.begin(), prev_inputs.end(),
                 std::back_inserter(input_dims),
                 [](const Var_Grad *vg) { return vg->getDim(); });

  /** finalize the layer and get the final context */
  auto init_context = lnode->finalize(input_dims);

  /**
   * Request manager for either a pre-allocated output as input or a newly
   * allocated output. This is necessary for manager to know when this output
   * node is going to be used.
   */
  std::vector<std::string> input_names;
  input_names.reserve(prev_inputs.size());
  std::transform(prev_inputs.begin(), prev_inputs.end(),
                 std::back_inserter(input_names),
                 [](auto const &vg) { return vg->getName(); });
  const std::vector<Var_Grad *> &inputs = tensor_manager->requestInputs(
    gnode, init_context.getInputDimensions(), input_names);

  /** In-Place optimizations */
  /**
   * Request manager for either a pre-allocated input as output or a newly
   * allocated output. This is necessary for manager to know when this output
   * node is going to be used with in-place optimizations.
   */
  /** local copy of the output specs, adjusted below before being requested */
  auto out_specs = init_context.getOutSpecs();
  /// @note try move inplace control to finalize
  bool shared_var = false, shared_grad = false;
  if (lnode->executeInPlace() != InPlace::NONE) {
    setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
    for (unsigned int i = 0; i < out_specs.size(); ++i) {
      auto &s = out_specs.at(i);
      /**
       * a shared output becomes a read-only view: of the i-th input for the
       * identity layer, of the first input for every other layer
       */
      if (shared_var) {
        s.variable_spec.request_type =
          TensorSpecV2::RequestType::READ_ONLY_VIEW;
        if (lnode->getType() == IdentityLayer::type) {
          s.variable_spec.reference_name = inputs[i]->getName();
        } else {
          s.variable_spec.reference_name = inputs[0]->getName();
        }
      }
      /** same for the gradient, if the output has a gradient spec */
      if (shared_grad && s.gradient_spec) {
        s.gradient_spec->request_type =
          TensorSpecV2::RequestType::READ_ONLY_VIEW;
        if (lnode->getType() == IdentityLayer::type) {
          s.gradient_spec->reference_name = inputs[i]->getGradientName();
        } else {
          s.gradient_spec->reference_name = inputs[0]->getGradientName();
        }
      }
    }
  }
  /**
   * a node requiring a label must have exactly one output spec with a
   * gradient spec, which is requested as a placeholder
   */
  if (lnode->requireLabel()) {
    NNTR_THROW_IF(out_specs.size() != 1, std::invalid_argument)
      << "out specification size must be 1 for label layer for now, "
      << lnode->getName() << " out spec size: " << out_specs.size();
    NNTR_THROW_IF(out_specs[0].gradient_spec == nullptr, std::invalid_argument)
      << "label space does not exist for " << lnode->getName();
    out_specs[0].gradient_spec->request_type =
      TensorSpecV2::RequestType::PLACEHOLDER;
  }

  /// @note below needs to be enabled only for inference mode, but need decision
  /// if we are going to separate inference initialization from train
  /// initialization this might not worth optimize because in general output of
  /// a neuralnet is very small
  /**
   * for a node without output connections, the forward order of
   * forward_iter_end is added as an additional execution order of its outputs
   */
  if (lnode->getOutputConnections().size() == 0u) {
    std::for_each(out_specs.begin(), out_specs.end(),
                  [this](VarGradSpecV2 &spec) {
                    spec.variable_spec.additional_exec_order.push_back(
                      std::get<0>(forward_iter_end->getExecutionOrder()));
                  });
  }

  /**
   * outputs of the rnn/lstm/gru cell layers get the forward-grad lifespan
   * NOTE(review): `this` is captured by the lambda below but not used
   */
  if (lnode->getType() == RNNCellLayer::type or
      lnode->getType() == LSTMCellLayer::type or
      lnode->getType() == GRUCellLayer::type) {
    std::for_each(
      out_specs.begin(), out_specs.end(), [this](VarGradSpecV2 &spec) {
        spec.variable_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
      });
  }

  /** request the outputs with the adjusted specs */
  const std::vector<Var_Grad *> &outputs = tensor_manager->requestTensors(
    out_specs, Manager::TensorGroupType::OUTPUT, lnode->getExecutionOrder(),
    lnode->getName());

  /** create shared weight names if requested */
  std::vector<std::string> shared_weight_names;
  std::vector<std::string> shared_tensor_names;
  if (auto shared_node_str = lnode->getSharedFrom(); !shared_node_str.empty()) {
    /// @note below is commented but kept from quick fix to be referenced for
    /// later(#1707)
    // auto shared_node = getLayerNode(shared_node_str).get();
    // NNTR_THROW_IF(shared_node == nullptr, std::invalid_argument)
    //   << "shared_node requested but it is not registered in the graph,
    //   name:
    //   "
    //   << shared_node_str << " requested from " << lnode->getName();
    // NNTR_THROW_IF(shared_node->getType() != lnode->getType(),
    //               std::invalid_argument)
    //   << " shared_node and lnode type mismatch, source node type: "
    //   << shared_node->getType() << " depedent node type: " <<
    //   lnode->getType()
    //   << " depedent node name: " << lnode->getName();
    // NNTR_THROW_IF(!shared_node->isFinalized(), std::invalid_argument)
    //   << "shared node must be prior to the dependent node and it should be
    //   "
    //      "finalized beforehand, shared node name: "
    //   << shared_node_str << " dependent node name: " << lnode->getName();
    // auto num_weight = shared_node->getNumWeights();
    // shared_weight_names.reserve(num_weight);
    // for (auto i = 0u; i < num_weight; ++i) {
    //   shared_weight_names.emplace_back(shared_node->getWeightName(i));
    // }
    // auto &rc = node->getRunContext();

    /// @fixme tensor should be only shared if context explicitly requested to
    /// do so. This has to be added to the part of tensor spec, other wise it
    /// will break many things
    /** presumably element 3 of a tensor spec is its name - TODO confirm */
    const auto &t_specs = init_context.getTensorsSpec();
    for (auto i = 0u; i < t_specs.size(); ++i) {
      shared_tensor_names.emplace_back(std::get<3>(t_specs.at(i)));
    }

    /** presumably element 7 of a weight spec is its name - TODO confirm */
    const auto &w_specs = init_context.getWeightsSpec();
    for (auto i = 0u; i < w_specs.size(); ++i) {
      shared_weight_names.emplace_back(std::get<7>(w_specs.at(i)));
    }
  }

  /** hand weights, inputs, outputs and tensors over to the node */
  lnode->configureRunContext(
    // TODO: update weights spec for trainable based on layer trainable prop
    tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
                                   lnode->getTrainable(), shared_weight_names),
    inputs, outputs,
    tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
                                   lnode->getTrainable(), shared_tensor_names));

  return outputs;
}
 851
 852 int NetworkGraph::initialize(const std::vector<Connection> &model_input_names,
 853                              const std::vector<Connection> &model_label_names) {
 854
 855   /**
 856    * this contains the map from node name to its input tensor names
 857    * @note: these input tensors have already been allocated
 858    */
 859   std::unordered_map<std::string, std::vector<Var_Grad *>> input_map;
 860
 861   /** check if the given config of node is of input node */
 862   auto is_input_node = [](const LayerNode *node) -> bool {
 863     return node->getInputConnections().empty();
 864   };
 865
 866   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
 867     std::vector<Var_Grad *> inputs = {};
 868     auto const &lnode = getSortedLayerNode(idx);
 869
 870     if (profile_keys.find(lnode->getType()) == profile_keys.end()) {
 871       int event_key = 0;
 872       PROFILE_TIME_REGISTER_EVENT(event_key, lnode->getType());
 873       profile_keys[lnode->getType()] = event_key;
 874     }
 875
 876     /**
 877      * Set input dimension for all the layers.
 878      * For input layer, as input dimension is known, set input tensor.
 879      */
 880     if (!is_input_node(lnode.get())) {
 881       if (input_map.find(lnode->getName()) == input_map.end())
 882         throw std::runtime_error("Cannot find input buffers for the node");
 883       inputs = input_map.at(lnode->getName());
 884     }
 885
 886     /**
 887      * Initialize all the layers, allocate output tensors for each layer
 888      * init2and add optimizer related weights for the layer
 889      */
 890     const std::vector<Var_Grad *> &outputs = finalizeContext(lnode, inputs);
 891
 892     /** no need to update input_map for the last layer */
 893     if (idx == graph.size() - 1)
 894       break;
 895
 896     for (auto i = 0u, num_node = lnode->getNumOutputConnections(); i < num_node;
 897          ++i) {
 898       auto conn = lnode->getOutputConnection(i);
 899       if (!conn) {
 900         ml_logi("out connection not defined for  %s, %u",
 901                 lnode->getName().c_str(), i);
 902         continue;
 903       }
 904
 905       auto sink_node = getLayerNode(conn->getName());
 906       [[maybe_unused]] auto [it, b] =
 907         input_map.try_emplace({sink_node->getName(), {}});
 908
 909       NNTR_THROW_IF(sink_node->getInputConnectionName(conn->getIndex()) !=
 910                       lnode->getName(),
 911                     std::invalid_argument)
 912         << "node pair does not match between " << lnode->getName() << ' '
 913         << sink_node->getName();
 914
 915       auto &sink_tensors = it->second;
 916       sink_tensors.resize(sink_node->getNumInputConnections());
 917       sink_tensors[conn->getIndex()] = outputs[i];
 918     }
 919   }
 920
 921   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
 922     auto const &lnode = getSortedLayerNode(idx);
 923     auto &rc = lnode->getRunContext();
 924     auto first_grad_access = std::get<1>(lnode->getExecutionOrder());
 925     auto last_grad_access = std::get<3>(lnode->getExecutionOrder());
 926     for (unsigned i = 0; i < rc.getNumWeights(); ++i) {
 927       if (!rc.weightHasGradient(i)) {
 928         /// @todo this is duck taping that MUST BE REMOVED. We will need to
 929         /// have, is weight first access kind of concept.
 930         if (tensor_manager->isFirstAccess(
 931               rc.getWeight(i).getName(),
 932               std::get<0>(lnode->getExecutionOrder()), true)) {
 933           rc.getWeightObject(i).setAsGradientFirstAccess();
 934         }
 935         if (tensor_manager->isLastAccess(rc.getWeight(i).getName(),
 936                                          last_grad_access, true)) {
 937           rc.getWeightObject(i).setAsGradientLastAccess();
 938         }
 939       } else {
 940         if (tensor_manager->isFirstAccess(rc.getWeightGrad(i).getName(),
 941                                           first_grad_access)) {
 942           rc.getWeightObject(i).setAsGradientFirstAccess();
 943         }
 944         /**
 945          * if the gradient is to be clipped by global norm, then the last access
 946          * is by clipping itself. However, as clipping is not a layer and does
 947          * not contain any weights, such weights never get assigned
 948          * gradient_last_access. This is a quick hotfix.
 949          * TODO: make an independent clipping layer which will execute at the
 950          * end, and will share ownership of weights which it will clip. This
 951          * will remove this hot fix, and also remove the checks of if weights
 952          * require clipping.
 953          */
 954         if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
 955                                          last_grad_access) ||
 956             (rc.isGradientClipByGlobalNorm(i) &&
 957              tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
 958                                                 last_grad_access))) {
 959           rc.getWeightObject(i).setAsGradientLastAccess();
 960         }
 961       }
 962     }
 963   }
 964   /**** identify model input / output to be set externally later ****/
 965   auto identify_as_model_input = [this](LayerNode *node) {
 966     auto num_input = node->getNumInputs();
 967     NNTR_THROW_IF(num_input != 1, std::invalid_argument)
 968       << "Input layer is supposed to have exactly one input, but more then "
 969          "one input detected, num inputs: "
 970       << num_input;
 971
 972     input_list.push_back(node->getInput(0).getName());
 973     input_dims.push_back(node->getInputDimensions()[0]);
 974   };
 975
 976   auto is_label_node = [](LayerNode *node) { return node->requireLabel(); };
 977
 978   auto identify_as_model_label = [this](LayerNode *node) {
 979     /// @todo change this as lnode->getNumLabels of sorts
 980     auto num_label = node->getNumOutputs();
 981     NNTR_THROW_IF(!node->getOutputConnections().empty(), std::invalid_argument)
 982       << "label layer is supposed to be a leaf for now";
 983     NNTR_THROW_IF(num_label != 1, std::invalid_argument)
 984       << "label layer is supposed to have exactly one label, but more then "
 985          "one label detected, num labels: "
 986       << num_label;
 987
 988     /// @todo implement and use getLabel(0) instead.
 989     output_list.push_back(node->getOutput(0).getName());
 990     label_list.push_back(node->getOutputGrad(0).getName());
 991     label_dims.push_back(node->getOutputDimensions()[0]);
 992   };
 993
 994   auto identify_external_tensors = [this](const std::vector<Connection> &conns,
 995                                           auto &&pred, auto &&identify) {
 996     if (conns.empty()) {
 997       for (unsigned int i = 0; i < graph.size(); ++i) {
 998         auto lnode = getSortedLayerNode(i).get();
 999         if (!pred(lnode)) {
1000           continue;
1001         }
1002         /// when name is empty, we identify everything as the node, all of
1003         /// them must be having identical dimensions
1004         identify(lnode);
1005       }
1006     } else {
1007       for (auto &conn : conns) {
1008         auto lnode = getLayerNode(conn.getName()).get();
1009         NNTR_THROW_IF(!pred(lnode), std::invalid_argument)
1010           << "given node is not of that kind, name: " << conn.getName();
1011         identify(lnode);
1012       }
1013       unsigned int num_node_of_kind = 0;
1014       for (unsigned int i = 0; i < graph.size(); ++i) {
1015         auto lnode = getSortedLayerNode(i).get();
1016         if (!pred(lnode)) {
1017           continue;
1018         }
1019         num_node_of_kind++;
1020       }
1021       NNTR_THROW_IF(num_node_of_kind != conns.size(), std::invalid_argument)
1022         << "conns given but there are not identified node of the kind, num "
1023            "node of kind: "
1024         << num_node_of_kind << " identifier size: " << conns.size();
1025     }
1026   };
1027
1028   identify_external_tensors(model_input_names, is_input_node,
1029                             identify_as_model_input);
1030   identify_external_tensors(model_label_names, is_label_node,
1031                             identify_as_model_label);
1032
1033   /** mark the nodes which will be backwarded during the graph operation */
1034   try {
1035     markNodesForBackwarding();
1036     backward_iter_end = computeBackwardEnd();
1037   } catch (std::exception &e) {
1038     ml_loge(
1039       "Backwarding required from layer which doesn't support backwarding: %s",
1040       e.what());
1041     return ML_ERROR_INVALID_PARAMETER;
1042   }
1043
1044   /** select weights which would require clipping of the gradients by global
1045    * norm if any */
1046   clip_weights = tensor_manager->getWeights([](const Weight *w) {
1047     return w->hasGradient() && w->isGradientLastAccess() &&
1048            w->isGradientClipByGlobalNorm();
1049   });
1050
1051   return ML_ERROR_NONE;
1052 }
1053
1054 void NetworkGraph::setExternalTensors(const std::vector<Tensor> &data,
1055                                       const std::vector<std::string> names) {
1056
1057   /// feed or clear label
1058   for (unsigned int idx = 0; idx < names.size(); idx++) {
1059     if (data.empty())
1060       tensor_manager->fillPlaceholder(names[idx], Tensor());
1061     else if (data.size() == 1)
1062       tensor_manager->fillPlaceholder(names[idx], data[0]);
1063     else
1064       tensor_manager->fillPlaceholder(names[idx], data[idx]);
1065   }
1066 }
1067
1068 void NetworkGraph::setInputsLabels(const std::vector<Tensor> &inputs,
1069                                    const std::vector<Tensor> &labels) {
1070
1071   NNTR_THROW_IF(labels.size() > 1 && labels.size() != label_list.size(),
1072                 std::invalid_argument)
1073     << "label size does not match with the network requirements"
1074     << " label size: " << labels.size()
1075     << " requirements size: " << label_list.size();
1076
1077   NNTR_THROW_IF(inputs.size() > 1 && inputs.size() != input_list.size(),
1078                 std::invalid_argument)
1079     << "input size does not match with the network requirements"
1080     << " input size: " << inputs.size()
1081     << " requirements size: " << input_list.size();
1082
1083   setExternalTensors(inputs, input_list);
1084   setExternalTensors(labels, label_list);
1085 }
1086
1087 void NetworkGraph::setInputsLabels(sharedConstTensors &inputs,
1088                                    sharedConstTensors &labels) {
1089
1090   std::vector<Tensor> ins;
1091   std::transform(inputs.begin(), inputs.end(), std::back_inserter(ins),
1092                  [](auto const &val) { return *val.get(); });
1093
1094   std::vector<Tensor> labs;
1095   std::transform(labels.begin(), labels.end(), std::back_inserter(labs),
1096                  [](auto const &val) { return *val.get(); });
1097
1098   setInputsLabels(ins, labs);
1099 }
1100
1101 std::vector<Tensor> NetworkGraph::getOutputTensors() const {
1102   std::vector<Tensor> output_tensors;
1103   output_tensors.reserve(output_list.size());
1104
1105   for (auto const &name : output_list)
1106     output_tensors.push_back(*tensor_manager->getTensor(name));
1107
1108   return output_tensors;
1109 }
1110
1111 void NetworkGraph::flushCache() { tensor_manager->flushCache(); }
1112
1113 void NetworkGraph::flushCacheExcept(unsigned int order) {
1114   tensor_manager->flushCacheExcept(order);
1115 }
1116
1117 void NetworkGraph::requestOptimizerVariable(
1118   std::function<std::vector<TensorDim>(const TensorDim &)> cb,
1119   bool request_only_trainable) {
1120   for (auto const &w : tensor_manager->getWeights()) {
1121     if (w->isGradientLastAccess() && w->hasGradient()) {
1122       const TensorDim &dim = w->getDim();
1123       std::vector<TensorDim> dims = cb(dim);
1124       w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
1125         dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
1126         w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
1127     }
1128   }
1129 }
1130
1131 } /* namespace nntrainer */