[nnc] Free temporary tensors to reduce memory consumption of ACL artifact (#2767)
author Efimov Alexander/AI Tools Lab/Samsung Electronics <a.efimov@samsung.com>
Fri, 11 Jan 2019 09:55:25 +0000 (12:55 +0300)
committer Roman Rusyaev/AI Tools Lab/SRR/Staff Engineer/Samsung Electronics <r.rusyaev@samsung.com>
Fri, 11 Jan 2019 09:55:25 +0000 (12:55 +0300)
- Remove redundant reverse shape transposes, e.g. [0,1,2,3] -> [3,2,1,0]
- Refactor tensor serialization

Signed-off-by: Efimov Alexander <a.efimov@samsung.com>
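
The effect on the generated artifact, as a rough sketch (identifiers below are
illustrative, not taken from a real model): persistent tensors such as weights,
biases and operation outputs are still allocated in the artifact constructor,
while the NCHW temporaries created around layout transposes are now allocated
inside the inference function and, when the cli::debugTranspose option is set,
freed as soon as they are consumed:

    void AclArtifact::Inference() {
      _in_transposed.allocator()->allocate();   // temporary: NHWC -> NCHW input copy
      _in_transpose_layer.run();
      _out_transposed.allocator()->allocate();  // temporary: NCHW operation output
      _op_layer.run();
      _out_retranspose_layer.run();             // NCHW -> NHWC, back to MIR layout
      _in_transposed.allocator()->free();       // released right after last use,
      _out_transposed.allocator()->free();      // instead of living in the object
    }
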
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h

contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
index 1a159e4..f2119e9 100644
@@ -92,7 +92,7 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
 
   // Generate all the deferred entities.
   genNamed();
-  genAllocates();
+  genPersistentTensorAllocations();
   genDeserializations();
   genFillings();
 
@@ -133,7 +133,7 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
   auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
                         {inputs, AF::ref(out), AF::lit(axis_name)});
 
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -179,7 +179,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     // Apply the softmax operation.
     auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(in), AF::ref(output)});
-    allocate(output);
+    addToPersistentTensors(output);
     genLayerExecution(sm);
   } else {
     // TODO: refactor this code; it works only with 1 batch
@@ -198,19 +198,19 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     // Do the input permutation.
     auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
                           {AF::ref(in), AF::ref(tmp)});
-    allocate(tmp);
+    addToPersistentTensors(tmp);
     genLayerExecution(transp1);
 
     // Apply the softmax operation.
     auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(tmp), AF::ref(tmp2)});
-    allocate(tmp2);
+    addToPersistentTensors(tmp2);
     genLayerExecution(sm);
 
     // Reshape the output to the original form.
     auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
                           {AF::ref(tmp2), AF::ref(output)});
-    allocate(output);
+    addToPersistentTensors(output);
     genLayerExecution(transp2);
   }
 }
@@ -266,7 +266,8 @@ AclCppOpGenerator::genTransposeMIRtoACL(const string& name,
   Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
-  genTranspose(input, transposed_id, {0, 3, 1, 2});
+  const bool allocate_at_inference = true;
+  genTranspose(input, transposed_id, {0, 3, 1, 2}, allocate_at_inference);
   return transposed_id;
 }
 
@@ -285,7 +286,9 @@ AclCppOpGenerator::genTransposeACLtoMIR(const string& name,
   Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
-  genTranspose(input, transposed_id, {0, 2, 3, 1});
+
+  const bool allocate_at_inference = false;
+  genTranspose(input, transposed_id, {0, 2, 3, 1}, allocate_at_inference);
   return transposed_id;
 }
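
Note the asymmetry between the two helpers above: genTransposeMIRtoACL passes
allocate_at_inference = true because its NCHW result is a backend-internal
temporary, while genTransposeACLtoMIR passes false, presumably because its
NHWC result is the operation's externally visible output and must outlive the
inference call.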
 
@@ -346,12 +349,17 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   // Actual layer creation
   shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
       {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info});
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
 
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
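
The same pattern repeats below for BiasAddOp, ScaleOp and genConvolution: the
transposed output is allocated in the inference block just before the layer
runs, and both transposed temporaries are freed (under cli::debugTranspose)
once the NCHW -> NHWC transpose back to MIR layout has consumed them; see the
sketch after the commit message for the shape of the emitted code.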
@@ -380,10 +388,10 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
   // Instantiate the CLFullyConnectedLayer object.
   auto layer = genLayer("arm_compute::CLFullyConnectedLayer", operation_name,
                         {AF::ref(in), AF::ref(weights), AF::lit("nullptr"), AF::ref(out)});
-  allocate(weights);
+  addToPersistentTensors(weights);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
   serializeTensor(weights, ir_weights);
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -455,16 +463,21 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   auto layer = genLayer("arm_compute::CLArithmeticAddition", layer_name,
                         {AF::ref(transposed_input), AF::ref(biases), AF::ref(transposed_output),
                          AF::lit("arm_compute::ConvertPolicy::WRAP")});
-  allocate(biases);
+  addToPersistentTensors(biases);
   // Save the IR biases tensor so it can be read back in the artifact.
   serializeTensor(biases, ir_biases);
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   if (out_shape.rank() == 4) {
     // Generate output in NHWC format
     shared_ptr<ArtifactId> output =
         genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+    if (cli::debugTranspose) {
+      genTensorDeallocation(_infBlock, transposed_input);
+      genTensorDeallocation(_infBlock, transposed_output);
+    }
   }
 
   if (op.getNextNodes().empty())
@@ -481,7 +494,7 @@ void AclCppOpGenerator::visit(ops::VariableOp& op) {
     else
       tensor = genTensor(op, op.getOutputShape(0));
   }
-  allocate(tensor);
+  addToPersistentTensors(tensor);
 }
 
 void AclCppOpGenerator::visit(ops::ConstantOp& op) {
@@ -490,7 +503,7 @@ void AclCppOpGenerator::visit(ops::ConstantOp& op) {
 
   shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
-  allocate(out);
+  addToPersistentTensors(out);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
   serializeTensor(out, data);
 }
@@ -526,7 +539,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
                         {AF::ref(in), AF::ref(out)});
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -600,19 +613,25 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
   auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_2",
                          {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)});
-  allocate(scales);
+  addToPersistentTensors(scales);
   // Save the IR scales tensor so it can be read back in the artifact.
   serializeTensor(scales, ir_scales);
-  allocate(unit);
+  addToPersistentTensors(unit);
   // Fill the unit tensor with the value 1.
   fillTensor(unit, "1");
-  allocate(tmp);
-  allocate(transposed_output);
+  addToPersistentTensors(tmp);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer2);
 
   // Generate output in NHWC format
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
@@ -696,7 +715,7 @@ void AclCppOpGenerator::visit(ops::PadOp& op) {
 template <typename Op>
 void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
   auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
-  const auto& ir_weights_shape = ir_weights.getShape();
+  const Shape& ir_weights_shape = ir_weights.getShape();
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() == 1);
@@ -743,17 +762,22 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
 
   // Create the convolution (/depthwise convolution/deconvolution) layer class instance.
   shared_ptr<ArtifactId> layer = genLayer(acl_func_name, operation_name, config_params);
-  allocate(weights);
+  addToPersistentTensors(weights);
 
   // Save the IR weights tensor so it can be read back in the artifact.
   serializeTensor(weights, ir_weights);
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   // Generate an auxiliary tensor to hold the transposed output of the convolution in NHWC format
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
 
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
@@ -788,7 +812,7 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   // Create an instance of the CLActivationLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
                         {AF::ref(in), AF::ref(output), activation_info});
-  allocate(output);
+  addToPersistentTensors(output);
   genLayerExecution(layer);
 }
 
@@ -1000,13 +1024,25 @@ void AclCppOpGenerator::visit(ops::SqrtOp& op) {
   throw AclCppException("Unimplemented operation: Sqrt");
 }
 
-void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
-  _allocates.push_back(tensor_id);
+void AclCppOpGenerator::addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id) {
+  _persistent_tensors.push_back(tensor_id);
 }
 
-void AclCppOpGenerator::genAllocates() {
-  for (auto a : _allocates)
-    _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorAllocation(ArtifactBlock* block,
+                                       const shared_ptr<ArtifactId>& tensor) {
+  return block->call("allocate", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorDeallocation(ArtifactBlock* block,
+                                         const shared_ptr<ArtifactId>& tensor) {
+  return block->call("free", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+void AclCppOpGenerator::genPersistentTensorAllocations() {
+  for (const shared_ptr<ArtifactId>& tensor : _persistent_tensors)
+    genTensorAllocation(_constrBlock, tensor);
 }
 
 shared_ptr<ArtifactId>
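
For context, these helpers emit the standard ACL allocation idiom into the
chosen block; in plain arm_compute C++ the generated calls correspond to the
following (a sketch, where t stands for any generated CLTensor member):

    #include <arm_compute/runtime/CL/CLTensor.h>

    void example(arm_compute::CLTensor& t) {
      t.allocator()->allocate();  // what genTensorAllocation() emits
      t.allocator()->free();      // what genTensorDeallocation() emits
    }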
@@ -1032,7 +1068,8 @@ void AclCppOpGenerator::visit(mir::ops::ReduceFOp& op) {
 
 void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
                                      const std::shared_ptr<nnc::ArtifactId>& output,
-                                     const std::vector<size_t>& mir_perm) {
+                                     const std::vector<size_t>& mir_perm,
+                                     bool allocate_at_inference) {
 
   // ACL 18.8 OpenCL implementation supports only 3 types of permutation:
   // in MIR (0, 3, 1, 2), in ACL (axes are in reverse order): (1, 2, 0)
@@ -1061,7 +1098,10 @@ void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId>& inp
   string layer_name = out_name + "_transpose_layer";
   list<shared_ptr<ArtifactExpr>> arguments = {AF::ref(input), AF::ref(output), perm_vector};
   auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments);
-  allocate(output);
+  if (allocate_at_inference)
+    genTensorAllocation(_infBlock, output);
+  else
+    addToPersistentTensors(output);
   genLayerExecution(layer);
 }
 
@@ -1081,7 +1121,7 @@ void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
   shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
 
   // Actual generation of operation and related stuff
-  genTranspose(input, output, mir_axis_order);
+  genTranspose(input, output, mir_axis_order, false);
 }
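
Explicit TransposeOp results stay persistent (allocate_at_inference = false):
they are regular operation outputs, not backend-internal temporaries.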
 
 void AclCppOpGenerator::visit(mir::ops::GatherOp& op) {
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h
index b3dc23f..81bb72c 100644
@@ -84,7 +84,7 @@ private:
   /**
    * @brief generate transpose of input tensor NHWC -> NCHW
    * @param name name of tensor containing transposed data
-   * @param input_shape shape of @p inpu
+   * @param input_shape shape of @p input
    * @param input id of input tensor
    * @return Id of result tensor
    */
@@ -95,7 +95,7 @@ private:
   /**
    * @brief generate transpose NCHW -> NHWC
    * @param name name of tensor containing transposed data
-   * @param input_shape shape of @p inpu
+   * @param input_shape shape of @p input
    * @param input id of input tensor
    * @return Id of result tensor
    */
@@ -212,7 +212,8 @@ private:
    */
    void genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
                      const std::shared_ptr<nnc::ArtifactId>& output,
-                     const std::vector<size_t>& mir_perm);
+                     const std::vector<size_t>& mir_perm,
+                     bool allocate_at_inference);
 
   /**
    * @brief Generates accessors for the input/output tensors.
@@ -250,15 +251,31 @@ private:
   void fillTensor(std::shared_ptr<ArtifactId> tensor_id, const std::string& val);
 
   /**
-   * @brief Schedule the tensor allocation.
+   * @brief Schedule the tensor allocation in the artifact constructor.
    * @param tensor_id - ID of the scheduled tensor.
    */
-  void allocate(std::shared_ptr<ArtifactId> tensor_id);
+  void addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id);
+
+  /**
+   * @brief Generate allocation of a tensor
+   * @param block Block to insert the allocation into
+   * @param tensor Id of the tensor to allocate
+   */
+  std::shared_ptr<ArtifactFunctionCall>
+  genTensorAllocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
+
+  /**
+   * @brief Generate deallocation of a tensor
+   * @param block Block to insert the deallocation into
+   * @param tensor Id of the tensor to deallocate
+   */
+  std::shared_ptr<ArtifactFunctionCall>
+  genTensorDeallocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
 
   /**
    * @brief Generate all the scheduled tensor allocations.
    */
-  void genAllocates();
+  void genPersistentTensorAllocations();
 
   /**
    * @brief Generate the layer declaration and the configure() call.
@@ -337,17 +354,17 @@ private:
   std::shared_ptr<ArtifactId> _clScheduler;
 
   /**
-   * Tensors which need to be allocated in the artifact.
+   * @brief Tensors which need to be allocated at artifact construction time.
    */
-  std::list<std::shared_ptr<ArtifactId>> _allocates;
+  std::list<std::shared_ptr<ArtifactId>> _persistent_tensors;
 
   /**
-   * Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
+   * @brief Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
    */
   std::list<std::shared_ptr<ArtifactId>> _serializations;
 
   /**
-   * Tensors which must be filled with constant values and the corresponding values.
+   * @brief Tensors which must be filled with constant values and the corresponding values.
    */
   std::list<std::pair<std::shared_ptr<ArtifactId>, std::string>> _fillings;
 };