// Generate all the deferred entities.
genNamed();
- genAllocates();
+ genPersistentTensorAllocations();
genDeserializations();
genFillings();
auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
{inputs, AF::ref(out), AF::lit(axis_name)});
- allocate(out);
+ addToPersistentTensors(out);
genLayerExecution(layer);
}
// Apply the softmax operation.
auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
{AF::ref(in), AF::ref(output)});
- allocate(output);
+ addToPersistentTensors(output);
genLayerExecution(sm);
} else {
// TODO refactor this code, it works only with 1 batch
// Do the input permutation.
auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
{AF::ref(in), AF::ref(tmp)});
- allocate(tmp);
+ addToPersistentTensors(tmp);
genLayerExecution(transp1);
// Apply the softmax operaion.
auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
{AF::ref(tmp), AF::ref(tmp2)});
- allocate(tmp2);
+ addToPersistentTensors(tmp2);
genLayerExecution(sm);
// Reshape the output to the original form.
auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
{AF::ref(tmp2), AF::ref(output)});
- allocate(output);
+ addToPersistentTensors(output);
genLayerExecution(transp2);
}
}
Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
shared_ptr<ArtifactId> transposed_id =
genTensor(name, transposed_shape, false);
- genTranspose(input, transposed_id, {0, 3, 1, 2});
+ const bool allocate_at_inference = true;
+ genTranspose(input, transposed_id, {0, 3, 1, 2}, allocate_at_inference);
return transposed_id;
}
Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
shared_ptr<ArtifactId> transposed_id =
genTensor(name, transposed_shape, false);
- genTranspose(input, transposed_id, {0, 2, 3, 1});
+
+ const bool allocate_at_inference = false;
+ genTranspose(input, transposed_id, {0, 2, 3, 1}, allocate_at_inference);
return transposed_id;
}
// Actual layer creation
shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
{AF::ref(transposed_input), AF::ref(transposed_output), pooling_info});
- allocate(transposed_output);
+ genTensorAllocation(_infBlock, transposed_output);
genLayerExecution(layer);
shared_ptr<ArtifactId> output =
genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+ if (cli::debugTranspose) {
+ genTensorDeallocation(_infBlock, transposed_input);
+ genTensorDeallocation(_infBlock, transposed_output);
+ }
+
if (op.getNextNodes().empty())
_outputs.insert(&op);
}
// Instantiate the CLFullyConnectedLayer object.
auto layer = genLayer("arm_compute::CLFullyConnectedLayer", operation_name,
{AF::ref(in), AF::ref(weights), AF::lit("nullptr"), AF::ref(out)});
- allocate(weights);
+ addToPersistentTensors(weights);
// Serialize the weights tensor and generate the function to deserialize it in the artifact.
serializeTensor(weights, ir_weights);
- allocate(out);
+ addToPersistentTensors(out);
genLayerExecution(layer);
}
auto layer = genLayer("arm_compute::CLArithmeticAddition", layer_name,
{AF::ref(transposed_input), AF::ref(biases), AF::ref(transposed_output),
AF::lit("arm_compute::ConvertPolicy::WRAP")});
- allocate(biases);
+ addToPersistentTensors(biases);
// Save the IR biases tensor to later read this in the artifact.
serializeTensor(biases, ir_biases);
- allocate(transposed_output);
+ genTensorAllocation(_infBlock, transposed_output);
genLayerExecution(layer);
if (out_shape.rank() == 4) {
// Generate output in NHWC format
shared_ptr<ArtifactId> output =
genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+ if (cli::debugTranspose) {
+ genTensorDeallocation(_infBlock, transposed_input);
+ genTensorDeallocation(_infBlock, transposed_output);
+ }
}
if (op.getNextNodes().empty())
else
tensor = genTensor(op, op.getOutputShape(0));
}
- allocate(tensor);
+ addToPersistentTensors(tensor);
}
void AclCppOpGenerator::visit(ops::ConstantOp& op) {
shared_ptr<ArtifactId> out = genTensor(op, out_shape);
- allocate(out);
+ addToPersistentTensors(out);
// Serialize the weights tensor and generate the function to deserialize it in the artifact.
serializeTensor(out, data);
}
// Create an instance of the CLReshapeLayer class as a member of the artifact class.
auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
{AF::ref(in), AF::ref(out)});
- allocate(out);
+ addToPersistentTensors(out);
genLayerExecution(layer);
}
auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
operation_name + "_arithmetic_div_layer_2",
{AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)});
- allocate(scales);
+ addToPersistentTensors(scales);
// Save the IR scales tensor to later read this in the artifact.
serializeTensor(scales, ir_scales);
- allocate(unit);
+ addToPersistentTensors(unit);
// Fill the unit tensor with the 1 value.
fillTensor(unit, "1");
- allocate(tmp);
- allocate(transposed_output);
+ addToPersistentTensors(tmp);
+ genTensorAllocation(_infBlock, transposed_output);
genLayerExecution(layer2);
// Generate output in NHWC format
shared_ptr<ArtifactId> output =
genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+ if (cli::debugTranspose) {
+ genTensorDeallocation(_infBlock, transposed_input);
+ genTensorDeallocation(_infBlock, transposed_output);
+ }
+
if (op.getNextNodes().empty())
_outputs.insert(&op);
}
template <typename Op>
void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
- const auto& ir_weights_shape = ir_weights.getShape();
+ const Shape& ir_weights_shape = ir_weights.getShape();
auto& prev_nodes = op.getPrevNodes();
assert(prev_nodes.size() == 1);
// Create the convolution (/depthwise convolution/deconvolution) layer class instance.
shared_ptr<ArtifactId> layer = genLayer(acl_func_name, operation_name, config_params);
- allocate(weights);
+ addToPersistentTensors(weights);
// Save the IR weights tensor to later read this in the artifact.
serializeTensor(weights, ir_weights);
- allocate(transposed_output);
+ genTensorAllocation(_infBlock, transposed_output);
genLayerExecution(layer);
// Generate auxiliar tensor to hold transposed output of convolution in NHWC format
shared_ptr<ArtifactId> output =
genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+ if (cli::debugTranspose) {
+ genTensorDeallocation(_infBlock, transposed_input);
+ genTensorDeallocation(_infBlock, transposed_output);
+ }
+
if (op.getNextNodes().empty())
_outputs.insert(&op);
}
// Create an instance of the CLActivationLayer class as a member of the artifact class.
auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
{AF::ref(in), AF::ref(output), activation_info});
- allocate(output);
+ addToPersistentTensors(output);
genLayerExecution(layer);
}
throw AclCppException("Unimplemented operation: Sqrt");
}
-void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
-  _allocates.push_back(tensor_id);
+// Record a tensor as persistent: it is kept in _persistent_tensors and its
+// allocation is emitted later into the artifact constructor by
+// genPersistentTensorAllocations().
+void AclCppOpGenerator::addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id) {
+  _persistent_tensors.push_back(tensor_id);
}
-void AclCppOpGenerator::genAllocates() {
-  for (auto a : _allocates)
-    _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
+// Emit "<tensor>.allocator()->allocate()" into the given artifact block and
+// return the generated call expression.
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorAllocation(ArtifactBlock* block,
+                                       const shared_ptr<ArtifactId>& tensor) {
+  return block->call("allocate", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+// Emit "<tensor>.allocator()->free()" into the given artifact block and
+// return the generated call expression.
+shared_ptr<ArtifactFunctionCall>
+AclCppOpGenerator::genTensorDeallocation(ArtifactBlock* block,
+                                         const shared_ptr<ArtifactId>& tensor) {
+  return block->call("free", {}, AF::call("allocator", {}, tensor), ArtifactCallType::ref);
+}
+
+// Emit an allocation into the artifact constructor block (_constrBlock) for
+// every tensor previously registered via addToPersistentTensors().
+void AclCppOpGenerator::genPersistentTensorAllocations() {
+  for (shared_ptr<ArtifactId> tensor: _persistent_tensors)
+    genTensorAllocation(_constrBlock, tensor);
}
shared_ptr<ArtifactId>
void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
const std::shared_ptr<nnc::ArtifactId>& output,
- const std::vector<size_t>& mir_perm) {
+ const std::vector<size_t>& mir_perm,
+ bool allocate_at_inference) {
// acl 18.8 opencl implementation supports only 3 types of permutation:
// in mir (0, 3, 1, 2), in acl(axes are in reverse order) (1, 2, 0)
string layer_name = out_name + "_transpose_layer";
list<shared_ptr<ArtifactExpr>> arguments = {AF::ref(input), AF::ref(output), perm_vector};
auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments);
- allocate(output);
+ if (allocate_at_inference)
+ genTensorAllocation(_infBlock, output);
+ else
+ addToPersistentTensors(output);
genLayerExecution(layer);
}
shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
// Actual generation of operation and related stuff
- genTranspose(input, output, mir_axis_order);
+ genTranspose(input, output, mir_axis_order, false);
}
void AclCppOpGenerator::visit(mir::ops::GatherOp& op) {
/**
* @brief generate transpose of input tensor NHWC -> NCHW
* @param name name of tensor containing transposed data
- * @param input_shape shape of @p inpu
+ * @param input_shape shape of @p input
* @param input id of input tensor
* @return Id of result tensor
*/
/**
* @brief generate transpose NCHW -> NHWC
* @param name name of tensor containing transposed data
- * @param input_shape shape of @p inpu
+ * @param input_shape shape of @p input
* @param input id of input tensor
* @return Id of result tensor
*/
*/
void genTranspose(const std::shared_ptr<nnc::ArtifactId>& input,
const std::shared_ptr<nnc::ArtifactId>& output,
- const std::vector<size_t>& mir_perm);
+ const std::vector<size_t>& mir_perm,
+ bool allocate_at_inference);
/**
* @brief Generates accessors for the input/output tensors.
void fillTensor(std::shared_ptr<ArtifactId> tensor_id, const std::string& val);
/**
- * @brief Schedule the tensor allocation.
+ * @brief Schedule the tensor allocation in the artifact constructor.
* @param tensor_id - ID of the scheduled tensor.
*/
- void allocate(std::shared_ptr<ArtifactId> tensor_id);
+ void addToPersistentTensors(std::shared_ptr<ArtifactId> tensor_id);
+
+ /**
+ * @brief Generate allocation of tensor
+ * @param block Block to insert allocation in
+ * @param tensor Id of tensor to allocate
+ * @return The generated allocation call artifact
+ */
+ std::shared_ptr<ArtifactFunctionCall>
+ genTensorAllocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
+
+ /**
+ * @brief Generate deallocation of tensor
+ * @param block Block to insert deallocation in
+ * @param tensor Id of tensor to deallocate
+ * @return The generated deallocation call artifact
+ */
+ std::shared_ptr<ArtifactFunctionCall>
+ genTensorDeallocation(ArtifactBlock* block, const std::shared_ptr<ArtifactId>& tensor);
/**
* @brief Generate all the scheduled tensor allocations.
*/
- void genAllocates();
+ void genPersistentTensorAllocations();
/**
* @brief Generate the layer declaration and the configure() call.
std::shared_ptr<ArtifactId> _clScheduler;
/**
- * Tensors which need to be allocated in the artifact.
+ * @brief Tensors which need to be allocated at the artifact construction time.
*/
- std::list<std::shared_ptr<ArtifactId>> _allocates;
+ std::list<std::shared_ptr<ArtifactId>> _persistent_tensors;
/**
- * Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
+ * @brief Tensors which are serialized from the Model IR and need to be deserialized in the artifact.
*/
std::list<std::shared_ptr<ArtifactId>> _serializations;
/**
- * Tensors which must be filled with constant values and the corresponding values.
+ * @brief Tensors which must be filled with constant values and the corresponding values.
*/
std::list<std::pair<std::shared_ptr<ArtifactId>, std::string>> _fillings;
};