[nnc] Free temporary tensors to reduce memory consumption of ACL artifact (#2767)
author Efimov Alexander/AI Tools Lab/Samsung Electronics <a.efimov@samsung.com>
Fri, 11 Jan 2019 09:55:25 +0000 (12:55 +0300)
committer Roman Rusyaev/AI Tools Lab/SRR/Staff Engineer/Samsung Electronics <r.rusyaev@samsung.com>
Fri, 11 Jan 2019 09:55:25 +0000 (12:55 +0300)
- Remove redundant reverse shape transposes, e.g. [0,1,2,3] -> [3,2,1,0]
- Refactor tensor serialization

Signed-off-by: Efimov Alexander <a.efimov@samsung.com>
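
The effect on the generated artifact, as a rough sketch (identifiers below are
illustrative, not taken from a real model): persistent tensors such as weights,
biases and operation outputs are still allocated in the artifact constructor,
while the NCHW temporaries created around layout transposes are now allocated
inside the inference function and, when the cli::debugTranspose option is set,
freed as soon as they are consumed:

    void AclArtifact::Inference() {
      _in_transposed.allocator()->allocate();   // temporary: NHWC -> NCHW input copy
      _in_transpose_layer.run();
      _out_transposed.allocator()->allocate();  // temporary: NCHW operation output
      _op_layer.run();
      _out_retranspose_layer.run();             // NCHW -> NHWC, back to MIR layout
      _in_transposed.allocator()->free();       // released right after last use,
      _out_transposed.allocator()->free();      // instead of living in the object
    }
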
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h

contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
index 1a159e4..f2119e9 100644
@@ -92,7 +92,7 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
 
   // Generate all the deferred entities.
   genNamed();
-  genAllocates();
+  genPersistentTensorAllocations();
   genDeserializations();
   genFillings();
 
@@ -133,7 +133,7 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
   auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
                         {inputs, AF::ref(out), AF::lit(axis_name)});
 
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -179,7 +179,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     // Apply the softmax operation.
     auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(in), AF::ref(output)});
-    allocate(output);
+    addToPersistentTensors(output);
     genLayerExecution(sm);
   } else {
     // TODO: refactor this code; it works only with 1 batch
@@ -198,19 +198,19 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     // Do the input permutation.
     auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
                           {AF::ref(in), AF::ref(tmp)});
-    allocate(tmp);
+    addToPersistentTensors(tmp);
     genLayerExecution(transp1);
 
     // Apply the softmax operation.
     auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(tmp), AF::ref(tmp2)});
-    allocate(tmp2);
+    addToPersistentTensors(tmp2);
     genLayerExecution(sm);
 
     // Reshape the output to the original form.
     auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
                           {AF::ref(tmp2), AF::ref(output)});
-    allocate(output);
+    addToPersistentTensors(output);
     genLayerExecution(transp2);
   }
 }
@@ -266,7 +266,8 @@ AclCppOpGenerator::genTransposeMIRtoACL(const string& name,
   Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
-  genTranspose(input, transposed_id, {0, 3, 1, 2});
+  const bool allocate_at_inference = true;
+  genTranspose(input, transposed_id, {0, 3, 1, 2}, allocate_at_inference);
   return transposed_id;
 }
 
@@ -285,7 +286,9 @@ AclCppOpGenerator::genTransposeACLtoMIR(const string& name,
   Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
-  genTranspose(input, transposed_id, {0, 2, 3, 1});
+
+  const bool allocate_at_inference = false;
+  genTranspose(input, transposed_id, {0, 2, 3, 1}, allocate_at_inference);
   return transposed_id;
 }
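
Note the asymmetry between the two helpers above: genTransposeMIRtoACL passes
allocate_at_inference = true because its NCHW result is a backend-internal
temporary, while genTransposeACLtoMIR passes false, presumably because its
NHWC result is the operation's externally visible output and must outlive the
inference call.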
 
@@ -346,12 +349,17 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   // Actual layer creation
   shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
       {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info});
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
 
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
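
The same pattern repeats below for BiasAddOp, ScaleOp and genConvolution: the
transposed output is allocated in the inference block just before the layer
runs, and both transposed temporaries are freed (under cli::debugTranspose)
once the NCHW -> NHWC transpose back to MIR layout has consumed them; see the
sketch after the commit message for the shape of the emitted code.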
@@ -380,10 +388,10 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
   // Instantiate the CLFullyConnectedLayer object.
   auto layer = genLayer("arm_compute::CLFullyConnectedLayer", operation_name,
                         {AF::ref(in), AF::ref(weights), AF::lit("nullptr"), AF::ref(out)});
-  allocate(weights);
+  addToPersistentTensors(weights);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
   serializeTensor(weights, ir_weights);
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -455,16 +463,21 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   auto layer = genLayer("arm_compute::CLArithmeticAddition", layer_name,
                         {AF::ref(transposed_input), AF::ref(biases), AF::ref(transposed_output),
                          AF::lit("arm_compute::ConvertPolicy::WRAP")});
-  allocate(biases);
+  addToPersistentTensors(biases);
   // Save the IR biases tensor so it can be read back in the artifact.
   serializeTensor(biases, ir_biases);
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   if (out_shape.rank() == 4) {
     // Generate output in NHWC format
     shared_ptr<ArtifactId> output =
         genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+    if (cli::debugTranspose) {
+      genTensorDeallocation(_infBlock, transposed_input);
+      genTensorDeallocation(_infBlock, transposed_output);
+    }
   }
 
   if (op.getNextNodes().empty())
@@ -481,7 +494,7 @@ void AclCppOpGenerator::visit(ops::VariableOp& op) {
     else
       tensor = genTensor(op, op.getOutputShape(0));
   }
-  allocate(tensor);
+  addToPersistentTensors(tensor);
 }
 
 void AclCppOpGenerator::visit(ops::ConstantOp& op) {
@@ -490,7 +503,7 @@ void AclCppOpGenerator::visit(ops::ConstantOp& op) {
 
   shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
-  allocate(out);
+  addToPersistentTensors(out);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
   serializeTensor(out, data);
 }
@@ -526,7 +539,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
                         {AF::ref(in), AF::ref(out)});
-  allocate(out);
+  addToPersistentTensors(out);
   genLayerExecution(layer);
 }
 
@@ -600,19 +613,25 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
   auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_2",
                          {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)});
-  allocate(scales);
+  addToPersistentTensors(scales);
   // Save the IR scales tensor so it can be read back in the artifact.
   serializeTensor(scales, ir_scales);
-  allocate(unit);
+  addToPersistentTensors(unit);
   // Fill the unit tensor with the value 1.
   fillTensor(unit, "1");
-  allocate(tmp);
-  allocate(transposed_output);
+  addToPersistentTensors(tmp);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer2);
 
   // Generate output in NHWC format
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
@@ -696,7 +715,7 @@ void AclCppOpGenerator::visit(ops::PadOp& op) {
 template <typename Op>
 void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
   auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
-  const auto& ir_weights_shape = ir_weights.getShape();
+  const Shape& ir_weights_shape = ir_weights.getShape();
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() == 1);
@@ -743,17 +762,22 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
 
   // Create the convolution (/depthwise convolution/deconvolution) layer class instance.
   shared_ptr<ArtifactId> layer = genLayer(acl_func_name, operation_name, config_params);
-  allocate(weights);
+  addToPersistentTensors(weights);
 
   // Save the IR weights tensor so it can be read back in the artifact.
   serializeTensor(weights, ir_weights);
-  allocate(transposed_output);
+  genTensorAllocation(_infBlock, transposed_output);
   genLayerExecution(layer);
 
   // Generate an auxiliary tensor to hold the transposed output of the convolution in NHWC format
   shared_ptr<ArtifactId> output =
       genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
 
+  if (cli::debugTranspose) {
+    genTensorDeallocation(_infBlock, transposed_input);
+    genTensorDeallocation(_infBlock, transposed_output);
+  }
+
   if (op.getNextNodes().empty())
     _outputs.insert(&op);
 }
@@ -788,7 +812,7 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   // Create an instance of the CLActivationLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
                         {AF::ref(in), AF::ref(output), activation_info});
-  allocate(output);
+  addToPersistentTensors(output);
   genLayerExecution(layer);
 }
 
@@ -1000,13 +1024,25 @@ void AclCppOpGenerator::visit(ops::SqrtOp& op) {
   throw AclCppException("Unimplemented operation: Sqrt");
 }
 
-void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
-  _allocates.push_back(tensor_id);
+void AclCppOpGenerator::addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id) {
+  _persistent_tensors.push_back(tensor_id);
 }
 
-void AclCppOpGenerator::genAllocates() {
-  for (auto a : _allocates)
-    _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorAllocation(ArtifactBlock* block,
+                                       const shared_ptr<ArtifactId>& tensor) {
+  return block->call("allocate", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorDeallocation(ArtifactBlock* block,
+                                         const shared_ptr<ArtifactId>& tensor) {
+  return block->call("free", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+void AclCppOpGenerator::genPersistentTensorAllocations() {
+  for (const shared_ptr<ArtifactId>& tensor : _persistent_tensors)
+    genTensorAllocation(_constrBlock, tensor);
 }
 
 shared_ptr<ArtifactId>
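
For context, these helpers emit the standard ACL allocation idiom into the
chosen block; in plain arm_compute C++ the generated calls correspond to the
following (a sketch, where t stands for any generated CLTensor member):

    #include <arm_compute/runtime/CL/CLTensor.h>

    void example(arm_compute::CLTensor& t) {
      t.allocator()->allocate();  // what genTensorAllocation() emits
      t.allocator()->free();      // what genTensorDeallocation() emits
    }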
@@ -1032,7 +1068,8 @@ void AclCppOpGenerator::visit(mir::ops::ReduceFOp& op) {
 
 void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
                                      const std::shared_ptr<nnc::ArtifactId>& output,
-                                     const std::vector<size_t>& mir_perm) {
+                                     const std::vector<size_t>& mir_perm,
+                                     bool allocate_at_inference) {
 
   // ACL 18.8 OpenCL implementation supports only 3 types of permutation:
   // in MIR (0, 3, 1, 2), in ACL (axes are in reverse order): (1, 2, 0)
@@ -1061,7 +1098,10 @@ void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId>& inp
   string layer_name = out_name + "_transpose_layer";
   list<shared_ptr<ArtifactExpr>> arguments = {AF::ref(input), AF::ref(output), perm_vector};
   auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments);
-  allocate(output);
+  if (allocate_at_inference)
+    genTensorAllocation(_infBlock, output);
+  else
+    addToPersistentTensors(output);
   genLayerExecution(layer);
 }
 
@@ -1081,7 +1121,7 @@ void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
   shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
 
   // Actual generation of operation and related stuff
-  genTranspose(input, output, mir_axis_order);
+  genTranspose(input, output, mir_axis_order, false);
 }
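
Explicit TransposeOp results stay persistent (allocate_at_inference = false):
they are regular operation outputs, not backend-internal temporaries.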
 
 void AclCppOpGenerator::visit(mir::ops::GatherOp& op) {
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h
index b3dc23f..81bb72c 100644
@@ -84,7 +84,7 @@ private:
   /**
    * @brief generate transpose of input tensor NHWC -> NCHW
    * @param name name of tensor containing transposed data
-   * @param input_shape shape of @p inpu
+   * @param input_shape shape of @p input
    * @param input id of input tensor
    * @return Id of result tensor
    */
@@ -95,7 +95,7 @@ private:
   /**
    * @brief generate transpose NCHW -> NHWC
    * @param name name of tensor containing transposed data
-   * @param input_shape shape of @p inpu
+   * @param input_shape shape of @p input
    * @param input id of input tensor
    * @return Id of result tensor
    */
@@ -212,7 +212,8 @@ private:
    */
    void genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
                      const std::shared_ptr<nnc::ArtifactId>& output,
-                     const std::vector<size_t>& mir_perm);
+                     const std::vector<size_t>& mir_perm,
+                     bool allocate_at_inference);
 
   /**
    * @brief Generates accessors for the input/output tensors.
@@ -250,15 +251,31 @@ private:
   void fillTensor(std::shared_ptr<ArtifactId> tensor_id, const std::string& val);
 
   /**
-   * @brief Schedule the tensor allocation.
+   * @brief Schedule the tensor allocation in the artifact constructor.
    * @param tensor_id - ID of the scheduled tensor.
    */
-  void allocate(std::shared_ptr<ArtifactId> tensor_id);
+  void addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id);
+
+  /**
+   * @brief Generate allocation of a tensor
+   * @param block Block to insert the allocation into
+   * @param tensor Id of the tensor to allocate
+   */
+  std::shared_ptr<ArtifactFunctionCall>
+  genTensorAllocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
+
+  /**
+   * @brief Generate deallocation of a tensor
+   * @param block Block to insert the deallocation into
+   * @param tensor Id of the tensor to deallocate
+   */
+  std::shared_ptr<ArtifactFunctionCall>
+  genTensorDeallocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
 
   /**
    * @brief Generate all the scheduled tensor allocations.
    */
-  void genAllocates();
+  void genPersistentTensorAllocations();
 
   /**
    * @brief Generate the layer declaration and the configure() call.
@@ -337,17 +354,17 @@ private:
   std::shared_ptr<ArtifactId> _clScheduler;
 
   /**
-   * Tensors which need to be allocated in the artifact.
+   * @brief Tensors which need to be allocated at artifact construction time.
    */
-  std::list<std::shared_ptr<ArtifactId>> _allocates;
+  std::list<std::shared_ptr<ArtifactId>> _persistent_tensors;
 
   /**
-   * Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
+   * @brief Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
    */
   std::list<std::shared_ptr<ArtifactId>> _serializations;
 
   /**
-   * Tensors which must be filled with constant values and the corresponding values.
+   * @brief Tensors which must be filled with constant values and the corresponding values.
    */
   std::list<std::pair<std::shared_ptr<ArtifactId>, std::string>> _fillings;
 };