From e5cde526949d96641769ecf67a6ccbf4fa417726 Mon Sep 17 00:00:00 2001
From: "Efimov Alexander/AI Tools Lab/./Samsung Electronics"
Date: Wed, 9 Jan 2019 19:01:26 +0300
Subject: [PATCH] [nnc] Support transpose in acl backend (#2746)

- Support transpose operations in acl backend
- Insert transposes in acl backend where needed
- Add test for transpose operation

Signed-off-by: Efimov Alexander
---
 .../passes/acl_soft_backend/AclCppOpGenerator.cpp  | 415 ++++++++++++++++-----
 .../passes/acl_soft_backend/AclCppOpGenerator.h    |  44 ++-
 contrib/nnc/unittests/acl_backend/MIRToDOM.cpp     |  31 +-
 3 files changed, 387 insertions(+), 103 deletions(-)

diff --git a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
index f7698ee..b0bb638 100644
--- a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
+++ b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
@@ -45,6 +45,7 @@
 #include "core/modelIR/operations/SoftmaxOp.h"
 #include "core/modelIR/operations/SqrtOp.h"
 #include "core/modelIR/operations/TanhOp.h"
+#include "core/modelIR/operations/TransposeOp.h"
 #include "core/modelIR/operations/VariableOp.h"

 #include

@@ -102,14 +103,26 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
 }

 void AclCppOpGenerator::visit(ops::ConcatOp& op) {
-  static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
-                                     "arm_compute::DataLayoutDimension::HEIGHT",
-                                     "arm_compute::DataLayoutDimension::WIDTH",
-                                     "arm_compute::DataLayoutDimension::CHANNEL"};
-
-  int axis = op.getAxis() < 0 ? op.getOutputShape(0).rank() + op.getAxis() : op.getAxis();
-  assert(axis < sizeof(axis_names) / sizeof(const char*));
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+
+  int axis = op.getAxis();
+  assert(axis < 4 && axis >= 0 && "axis outside this range is not supported in ACL");
+
+  const char* axis_name;
+  if (cli::debugTranspose) {
+    static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
+                                       "arm_compute::DataLayoutDimension::CHANNEL",
+                                       "arm_compute::DataLayoutDimension::HEIGHT",
+                                       "arm_compute::DataLayoutDimension::WIDTH"};
+    axis_name = axis_names[axis];
+  } else {
+    static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
+                                       "arm_compute::DataLayoutDimension::HEIGHT",
+                                       "arm_compute::DataLayoutDimension::WIDTH",
+                                       "arm_compute::DataLayoutDimension::CHANNEL"};
+    axis_name = axis_names[axis];
+  }
+
+  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -118,10 +131,10 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
     _constrBlock->call("push_back", {AF::ref(AF::id(tensorName(i.op)))}, inputs);

   auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
-                        {inputs, AF::ref(out), AF::lit(axis_names[axis])});
+                        {inputs, AF::ref(out), AF::lit(axis_name)});

   allocate(out);
-  runLayer(layer);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(ops::Conv2DOp& op) {
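A note on the transposeShape<...> helper this patch uses heavily: judging by its call sites, the i-th template argument selects which source dimension becomes the i-th result dimension. A minimal compile-only sketch of the assumed semantics (transposeShapeSketch is a hypothetical name; the include path assumes nnc's source layout):

    #include "core/modelIR/Shape.h"

    // Assumed semantics: result.dim(i) == source.dim(Perm[i]).
    // For an MIR NHWC shape {N, H, W, C}:
    //   transposeShapeSketch<0, 3, 1, 2> -> {N, C, H, W}  // NCHW in MIR dimension order
    //   transposeShapeSketch<3, 2, 1, 0> -> {C, W, H, N}  // fully reversed: the order an
    //                                                     // arm_compute::TensorShape expects
    template <int... Perm>
    mir::Shape transposeShapeSketch(const mir::Shape& source) {
      return mir::Shape{source.dim(Perm)...};
    }
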
AclCppException("Unsupported number of dimensions in softmax"); + } + + auto out = genTensor(op, transposed_out_shape); auto prefix = out->name(); if (axis == 0) { @@ -174,7 +200,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) { auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer", {AF::ref(in), AF::ref(out)}); allocate(out); - runLayer(sm); + genLayerExecution(sm); } else { // Need to reshape before the Softmax application and after it. // Then we need two tensors for intermediate results. This is because we do a couple of auxiliary @@ -187,19 +213,19 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) { auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1", {AF::ref(in), AF::ref(tmp)}); allocate(tmp); - runLayer(transp1); + genLayerExecution(transp1); // Apply the softmax operaion. auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer", {AF::ref(tmp), AF::ref(tmp2)}); allocate(tmp2); - runLayer(sm); + genLayerExecution(sm); // Reshape the output to the original form. auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2", {AF::ref(tmp2), AF::ref(out)}); allocate(out); - runLayer(transp2); + genLayerExecution(transp2); } } @@ -239,6 +265,44 @@ static shared_ptr return pad_stride_info_var; } +shared_ptr +AclCppOpGenerator::genTransposeMIRtoACL(const string& name, + const Shape& input_shape, + const shared_ptr& input) { + + if (!cli::debugTranspose) { + // Generate output tensor description in the DOM. + shared_ptr output = AF::id(name); + + _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input}); + return output; + } + Shape transposed_shape = transposeShape<2, 1, 3, 0>(input_shape); + shared_ptr transposed_id = + genTensor(name, transposed_shape, false); + genTranspose(input, transposed_id, {0, 3, 1, 2}); + return transposed_id; +} + +shared_ptr +AclCppOpGenerator::genTransposeACLtoMIR(const string& name, + const Shape& input_shape, + const shared_ptr& input) { + + if (!cli::debugTranspose) { + // Generate output tensor description in the DOM. 
@@ -257,26 +321,53 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   assert(prev_nodes.size() == 1);
   auto in_op = prev_nodes[0].op;

-  auto in = AF::id(tensorName(in_op));
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
-  auto prefix = out->name() + "_pooling_layer";
+  string in_name = tensorName(in_op);
+  auto in_id = AF::id(in_name);

-  auto pad_stride_info_var = genPadStrideInfo(op, prefix, _constrBlock);
+  const string output_tensor_name = tensorName(&op);

-  auto pad_stride_info = pad_stride_info_var->use();
-  auto kernel_window_var = _constrBlock->var("arm_compute::Size2D", prefix + "_kernel_window", {},
-                                             {AF::lit(to_string(op.getWindowShape().dim(1))),
-                                              AF::lit(to_string(op.getWindowShape().dim(0)))});
-  auto kernel_window = kernel_window_var->use();
-  auto pooling_info_var = _constrBlock->var(
-      "arm_compute::PoolingLayerInfo", prefix + "_pooling_info", {},
+  // Transpose data from MIR format to format compatible with ACL
+  const string transposed_input_name = output_tensor_name + "_transposed_input";
+  shared_ptr<ArtifactId> transposed_input =
+      genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), in_id);
+
+  const string layer_name = output_tensor_name + "_pooling_layer";
+
+  shared_ptr<ArtifactVariable> pad_stride_info_var =
+      genPadStrideInfo(op, layer_name, _constrBlock);
+
+  shared_ptr<ArtifactId> pad_stride_info = pad_stride_info_var->use();
+
+  // Create kernel window info
+  shared_ptr<ArtifactVariable> kernel_window_var =
+      _constrBlock->var("arm_compute::Size2D", layer_name + "_kernel_window", {},
+                        {AF::lit(to_string(op.getWindowShape().dim(1))),
+                         AF::lit(to_string(op.getWindowShape().dim(0)))});
+  shared_ptr<ArtifactId> kernel_window = kernel_window_var->use();
+
+  // Create pooling info: pooling type, kernel info, strides, etc.
+  shared_ptr<ArtifactVariable> pooling_info_var = _constrBlock->var(
+      "arm_compute::PoolingLayerInfo", layer_name + "_pooling_info", {},
       {AF::lit(pooling_type), kernel_window, pad_stride_info,
        AF::lit(op.getBorderType() == ops::PoolOp::BorderType::EMPTY ? "true" : "false")});
-  auto pooling_info = pooling_info_var->use();
-  auto layer = genLayer("arm_compute::CLPoolingLayer", prefix,
-                        {AF::ref(in), AF::ref(out), pooling_info});
-  allocate(out);
-  runLayer(layer);
+  shared_ptr<ArtifactId> pooling_info = pooling_info_var->use();
+
+  // Generate auxiliary tensor to hold transposed output of pool in NCHW format
+  Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
+  shared_ptr<ArtifactId> transposed_output =
+      genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape));
+
+  // Actual layer creation
+  shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
+                                          {AF::ref(transposed_input), AF::ref(transposed_output),
+                                           pooling_info});
+  allocate(transposed_output);
+  genLayerExecution(layer);
+
+  shared_ptr<ArtifactId> output =
+      genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (op.getNextNodes().empty())
+    _outputs.insert(&op);
 }

 void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
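At inference time the DOM built above boils down to a permute/pool/permute pipeline. A sketch of the equivalent direct ACL calls, with hypothetical tensor and function names (the permutation vectors follow the MIR-to-ACL mapping documented in genTranspose further down):

    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include <arm_compute/runtime/CL/functions/CLPermute.h>
    #include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>

    // Sketch only: NHWC input is permuted to NCHW, pooled there, permuted back.
    void poolingPipelineSketch(arm_compute::CLTensor& in, arm_compute::CLTensor& nchw_in,
                               arm_compute::CLTensor& nchw_out, arm_compute::CLTensor& out,
                               const arm_compute::PoolingLayerInfo& pooling_info) {
      arm_compute::CLPermute to_nchw;
      arm_compute::CLPoolingLayer pool;
      arm_compute::CLPermute to_nhwc;
      to_nchw.configure(&in, &nchw_in, arm_compute::PermutationVector(1U, 2U, 0U));
      pool.configure(&nchw_in, &nchw_out, pooling_info);
      to_nhwc.configure(&nchw_out, &out, arm_compute::PermutationVector(2U, 0U, 1U));
      // The artifact's inference function then calls run() on each layer in this order.
    }
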
"true" : "false")}); - auto pooling_info = pooling_info_var->use(); - auto layer = genLayer("arm_compute::CLPoolingLayer", prefix, - {AF::ref(in), AF::ref(out), pooling_info}); - allocate(out); - runLayer(layer); + shared_ptr pooling_info = pooling_info_var->use(); + + // Generate auxiliary tensor to hold transposed output of pool in NCHW format + Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0)); + shared_ptr transposed_output = + genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape)); + + // Actual layer creation + shared_ptr layer = genLayer("arm_compute::CLPoolingLayer", layer_name, + {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info}); + allocate(transposed_output); + genLayerExecution(layer); + + shared_ptr output = + genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output); + + if (op.getNextNodes().empty()) + _outputs.insert(&op); } void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) { @@ -307,7 +398,7 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) { // Serialize the weights tensor and generate the function to deserialize it in the artifact. serializeTensor(weights, ir_weights); allocate(out); - runLayer(layer); + genLayerExecution(layer); } void AclCppOpGenerator::visit(ops::GemmOp& op) { @@ -327,28 +418,45 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) { auto in_op = prev_nodes[0].op; // Get the input node tensor id in the DOM. - auto in = AF::id(tensorName(in_op)); + shared_ptr input = AF::id(tensorName(in_op)); + + const string output_tensor_name = tensorName(&op); + + shared_ptr transposed_input; + Shape transposed_output_shape; + shared_ptr transposed_output; // Create the output tensor in the DOM and obtain its identifier. const Shape& out_shape = op.getOutputShape(0); - Shape transposed_shape; + const string transposed_output_name = output_tensor_name + "_transposed_output"; + switch (out_shape.rank()) { - case 4: - transposed_shape = transposeShape<2, 1, 3, 0>(out_shape); + case 4: { + // transpose input to NCHW format supported by ACL + const string transposed_input_name = output_tensor_name + "_transposed_input"; + transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape); + transposed_input = genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input); + + transposed_output = + genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape)); break; + } case 2: - transposed_shape = transposeShape<1, 0>(out_shape); + transposed_output_shape = out_shape; + transposed_input = input; + transposed_output = genTensor(tensorName(&op), transposeShape<1, 0>(transposed_output_shape)); break; case 1: - transposed_shape = out_shape; + transposed_output_shape = out_shape; + transposed_input = input; + transposed_output = genTensor(tensorName(&op), out_shape); break; default: throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank())); } - shared_ptr out = genTensor(op, transposed_shape); // Prefix used for the name of variables related to the operation implementation. - string operation_name = out->name() + "_bias_add_layer"; + string layer_name = transposed_output->name() + "_bias_add_layer"; // Reshape the IR biases tensor and generate the corresponding DOM tensor. 
@@ -458,7 +581,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
                         {AF::ref(in), AF::ref(out)});
   allocate(out);
-  runLayer(layer);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(ops::ScaleOp& op) {
@@ -469,32 +592,42 @@
   auto in_op = prev_nodes[0].op;

   // Get input tensor identifier in the generated artifact.
-  auto in = AF::id(tensorName(in_op));
+  auto input = AF::id(tensorName(in_op));
+
+  const string output_tensor_name = tensorName(&op);
+
+  // transpose input to NCHW format supported by ACL
+  const string transposed_input_name = output_tensor_name + "_transposed_input";
+  shared_ptr<ArtifactId> transposed_input =
+      genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input);

   // Create the output tensor in the DOM and obtain its identifier.
   const Shape& out_shape = op.getOutputShape(0);
-  Shape transposed_shape;
+  Shape transposed_output_shape;

   switch (out_shape.rank()) {
     case 4:
-      transposed_shape = transposeShape<2, 1, 3, 0>(out_shape);
+      transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape);
       break;
     case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
+      transposed_output_shape = transposeShape<1, 0>(out_shape);
       break;
     case 1:
-      transposed_shape = out_shape;
+      transposed_output_shape = out_shape;
       break;
     default:
       throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
   }

-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
-  auto operation_name = out->name() + "_scale_layer";
+  const string transposed_output_name = output_tensor_name + "_transposed_output";
+  shared_ptr<ArtifactId> transposed_output =
+      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+
+  auto operation_name = transposed_output->name() + "_scale_layer";
   const auto& ir_scales = op.getWeights();

   // Reshape the IR scales tensor and generate the corresponding DOM tensor.
-  const auto ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
+  const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
   Shape ir_scales_shape(ir_input_shape.rank());

   // ACL CLArithmeticDivision supports input tensors broadcasting.
@@ -517,12 +650,12 @@
   auto layer1 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_1",
                          {AF::ref(unit), AF::ref(scales), AF::ref(tmp)});
-  runLayer(layer1);
+  genLayerExecution(layer1);

   // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
   auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_2",
-                         {AF::ref(in), AF::ref(tmp), AF::ref(out)});
+                         {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)});

   allocate(scales);
   // Save the IR scales tensor to later read this in the artifact.
   serializeTensor(scales, ir_scales);
@@ -530,12 +663,18 @@
   // Fill the unit tensor with the 1 value.
   fillTensor(unit, "1");
   allocate(tmp);
-  allocate(out);
-  runLayer(layer2);
+  allocate(transposed_output);
+  genLayerExecution(layer2);
+
+  // Generate output in NHWC format
+  shared_ptr<ArtifactId> output =
+      genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (op.getNextNodes().empty())
+    _outputs.insert(&op);
 }

 void AclCppOpGenerator::visit(mir::ops::SliceOp& op) {
-  assert(false && "Unimplemented operation: SliceOp");
+  throw AclCppException("Unimplemented operation: SliceOp");
 }

 void AclCppOpGenerator::visit(ops::BatchNormOp& op) {
@@ -564,7 +703,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) {

 void AclCppOpGenerator::visit(ops::ElementwiseOp& op) {
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));

   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() >= 2);
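The two CLArithmeticDivision layers in ScaleOp above exist because this ACL version offers no broadcasting multiply: the generator computes tmp = 1 / scales once, then out = in / tmp, which equals in * scales. A scalar sketch of the identity:

    #include <cassert>
    #include <cmath>

    // layer1: tmp = unit / scales;  layer2: out = input / tmp == input * scales.
    float scaleViaDivision(float input, float scale) {
      float tmp = 1.0f / scale;  // the "unit" tensor is filled with ones
      return input / tmp;
    }

    int main() {
      assert(std::fabs(scaleViaDivision(3.0f, 0.5f) - 1.5f) < 1e-6f);
      return 0;
    }
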
const Shape& out_shape = op.getOutputShape(0); - Shape transposed_shape; + Shape transposed_output_shape; switch (out_shape.rank()) { case 4: - transposed_shape = transposeShape<2, 1, 3, 0>(out_shape); + transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape); break; case 2: - transposed_shape = transposeShape<1, 0>(out_shape); + transposed_output_shape = transposeShape<1, 0>(out_shape); break; case 1: - transposed_shape = out_shape; + transposed_output_shape = out_shape; break; default: throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank())); } - shared_ptr out = genTensor(op, transposed_shape); - auto operation_name = out->name() + "_scale_layer"; + const string transposed_output_name = output_tensor_name + "_transposed_output"; + shared_ptr transposed_output = + genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape)); + + auto operation_name = transposed_output->name() + "_scale_layer"; const auto& ir_scales = op.getWeights(); // Reshape the IR scales tensor and generate the corresponding DOM tensor. - const auto ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0)); + const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0)); Shape ir_scales_shape(ir_input_shape.rank()); // ACL CLArithmeticDivision supports input tensors broadcasting. @@ -517,12 +650,12 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) { auto layer1 = genLayer("arm_compute::CLArithmeticDivision", operation_name + "_arithmetic_div_layer_1", {AF::ref(unit), AF::ref(scales), AF::ref(tmp)}); - runLayer(layer1); + genLayerExecution(layer1); // Create an instance of the CLArithmeticDivision class as a member of the artifact class. auto layer2 = genLayer("arm_compute::CLArithmeticDivision", operation_name + "_arithmetic_div_layer_2", - {AF::ref(in), AF::ref(tmp), AF::ref(out)}); + {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)}); allocate(scales); // Save the IR scales tensor to later read this in the artifact. serializeTensor(scales, ir_scales); @@ -530,12 +663,18 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) { // Fill the unit tensor with the 1 value. fillTensor(unit, "1"); allocate(tmp); - allocate(out); - runLayer(layer2); + allocate(transposed_output); + genLayerExecution(layer2); + + // Generate output in NHWC format + shared_ptr output = + genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output); + if (op.getNextNodes().empty()) + _outputs.insert(&op); } void AclCppOpGenerator::visit(mir::ops::SliceOp& op) { - assert(false && "Unimplemented operation: SliceOp"); + throw AclCppException( "Unimplemented operation: SliceOp"); } void AclCppOpGenerator::visit(ops::BatchNormOp& op) { @@ -564,7 +703,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) { void AclCppOpGenerator::visit(ops::ElementwiseOp& op) { // Create the output tensor in the DOM and obtain its identifier. - auto out = genTensor(op, op.getOutputShape(0)); + auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0))); auto& prev_nodes = op.getPrevNodes(); assert(prev_nodes.size() >= 2); @@ -621,12 +760,23 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons assert(prev_nodes.size() == 1); auto in_op = prev_nodes[0].op; + // get output tensor name that is used as base for other names + const string output_tensor_name = tensorName(&op); + // Get the identifier of the input tensor in the DOM. 
@@ -667,8 +826,13 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   auto in = AF::id(tensorName(in_op));

   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
-  auto prefix = out->name() + "_activation_layer";
+  shared_ptr<ArtifactId> output;
+  if (cli::debugTranspose)
+    output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  else
+    output = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+
+  auto prefix = output->name() + "_activation_layer";

   // Create an instance of the ActivationLayerInfo class as a local variable in the artifact
   // constructor. This instance provides information about the concrete activation function,
@@ -681,9 +845,9 @@
   // Create an instance of the CLActivationLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
-                        {AF::ref(in), AF::ref(out), activation_info});
-  allocate(out);
-  runLayer(layer);
+                        {AF::ref(in), AF::ref(output), activation_info});
+  allocate(output);
+  genLayerExecution(layer);
 }

 shared_ptr<ArtifactId> AclCppOpGenerator::genAddition(const string& prefix, int index,
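genActivation above packs the function kind plus its two optional parameters into an ActivationLayerInfo and hands it to CLActivationLayer. A sketch of the objects the generated code instantiates (hypothetical names; TANH is just an example):

    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

    void activationSketch(arm_compute::CLTensor& in, arm_compute::CLTensor& out,
                          float a, float b) {
      arm_compute::ActivationLayerInfo activation_info(
          arm_compute::ActivationLayerInfo::ActivationFunction::TANH, a, b);
      arm_compute::CLActivationLayer layer;
      layer.configure(&in, &out, activation_info);
    }
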
@@ -781,25 +945,37 @@ string AclCppOpGenerator::tensorName(const Operation* op) const {
   return tensor_name;
 }

-std::shared_ptr<ArtifactId> AclCppOpGenerator::genShape(ArtifactBlock* block, const string& prefix,
-                                                        const Shape& shape) {
+template <typename T>
+std::shared_ptr<ArtifactId>
+AclCppOpGenerator::genVectorInitializedVar(ArtifactBlock* block, const string& type,
+                                           const string& name, const vector<T>& init) {
   list<shared_ptr<ArtifactExpr>> dims;

-  for (int i = 0; i < shape.rank(); ++i)
-    dims.push_back(AF::lit(to_string(shape.dim(i))));
+  for (int i = 0; i < init.size(); ++i)
+    dims.push_back(AF::lit(to_string(init[i])));

-  auto shape_var = block->var("arm_compute::TensorShape", prefix + "_shape", {}, dims);
+  auto shape_var = block->var(type, name, {}, dims);
   auto shape_id = shape_var->use();
   return shape_id;
 }

-shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name, const Shape& ir_shape,
+shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name,
+                                                    const Shape& ir_shape,
                                                     bool gen_accessor) {
   auto id = AF::id(name);

   if (_tensorNames.insert(name).second) {
     _artifactClass->var(false, "arm_compute::CLTensor", name);
-    auto shape = genShape(_constrBlock, name, ir_shape);
+    vector<int32_t> shape_vectorized;
+
+    // Create a vector of initializers from the Shape.
+    shape_vectorized.reserve(ir_shape.rank());
+    for (int i = 0; i < ir_shape.rank(); ++i)
+      shape_vectorized.push_back(ir_shape.dim(i));
+
+    const char* type_name = "arm_compute::TensorShape";
+    shared_ptr<ArtifactId> shape =
+        genVectorInitializedVar(_constrBlock, type_name, name + "_shape", shape_vectorized);
     _constrBlock->call("initializeTensor", {id, shape});

     if (gen_accessor) {
@@ -895,11 +1071,11 @@ void AclCppOpGenerator::fillTensor(shared_ptr<ArtifactId> tensor_id, const strin
 }

 void AclCppOpGenerator::visit(ops::SqueezeOp& op) {
-  assert(false && "Unimplemented operation: Squeeze");
+  throw AclCppException("Unimplemented operation: Squeeze");
 }

 void AclCppOpGenerator::visit(ops::SqrtOp& op) {
-  assert(false && "Unimplemented operation: Sqrt");
+  throw AclCppException("Unimplemented operation: Sqrt");
 }

 void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
@@ -911,34 +1087,83 @@ void AclCppOpGenerator::genAllocates() {
     _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
 }

-shared_ptr<ArtifactId> AclCppOpGenerator::genLayer(
-    const string& layer_type,
-    const string& layer_name,
-    const list<shared_ptr<ArtifactExpr>>& config_params) {
+shared_ptr<ArtifactId>
+AclCppOpGenerator::genLayer(const string& layer_type, const string& layer_name,
+                            const list<shared_ptr<ArtifactExpr>>& config_params) {
   auto layer_var = _artifactClass->var(false, layer_type, layer_name);
   auto layer = layer_var->use();
   _constrBlock->call("configure", config_params, layer);
   return layer;
 }

-void AclCppOpGenerator::runLayer(shared_ptr<ArtifactId> layer_id) {
+void AclCppOpGenerator::genLayerExecution(shared_ptr<ArtifactId> layer_id) {
   _infBlock->call("run", {}, layer_id);
 }

 void AclCppOpGenerator::visit(mir::ops::ResizeOp& op) {
-  assert(false && "Unimplemented operation: Resize");
+  throw AclCppException("Unimplemented operation: Resize");
 }

 void AclCppOpGenerator::visit(mir::ops::ReduceFOp& op) {
-  assert(false && "Unimplemented operation: ReduceFOp");
+  throw AclCppException("Unimplemented operation: ReduceFOp");
+}
+
+void AclCppOpGenerator::genTranspose(const std::shared_ptr<ArtifactId>& input,
+                                     const std::shared_ptr<ArtifactId>& output,
+                                     const std::vector<size_t>& mir_perm) {
+
+  // The ACL 18.8 OpenCL implementation supports only 3 types of permutation:
+  // in MIR (0, 3, 1, 2), in ACL (axes are in reverse order): (1, 2, 0)
+  // in MIR (0, 2, 3, 1), in ACL: (2, 0, 1)
+  // in MIR (2, 3, 1, 0), in ACL: (3, 2, 0, 1)
+  // so here we try to transform the MIR transpose into one ACL supports.
+
+  const string& out_name = output->name();
+  vector<size_t> acl_perm;
+
+  if (mir_perm == vector<size_t>{0, 3, 1, 2})
+    acl_perm = {1, 2, 0};
+  else if (mir_perm == vector<size_t>{0, 2, 3, 1})
+    acl_perm = {2, 0, 1};
+  else if (mir_perm == vector<size_t>{2, 3, 1, 0})
+    acl_perm = {3, 2, 0, 1};
+  else
+    throw AclCppException("Unsupported transpose sequence in operation " + out_name);
+
+  // Create operation parameter containing permutation vector
+  shared_ptr<ArtifactId> perm_vector =
+      genVectorInitializedVar(_constrBlock, "arm_compute::PermutationVector",
+                              out_name + "_perm_param", acl_perm);
+
+  // Instantiate the CLPermute object.
+  string layer_name = out_name + "_transpose_layer";
+  list<shared_ptr<ArtifactExpr>> arguments = {AF::ref(input), AF::ref(output), perm_vector};
+  auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments);
+  allocate(output);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
-  assert(false && "Unimplemented operation: TransposeOp");
+  auto& prev_nodes = op.getPrevNodes();
+  assert(prev_nodes.size() == 1);
+  auto in_op = prev_nodes[0].op;
+
+  // Get the input node tensor id in the DOM.
+  shared_ptr<ArtifactId> input = AF::id(tensorName(in_op));
+  const vector<size_t>& mir_axis_order = op.getAxisOrder();
+
+  // Create the output tensor in the DOM.
+  if (op.getOutputShape(0).rank() != 4)
+    throw AclCppException("Unsupported number of dimensions in transpose operation");
+  // TODO replace transpose shape
+  shared_ptr<ArtifactId> output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+
+  // Actual generation of the operation and related objects
+  genTranspose(input, output, mir_axis_order);
 }

 void AclCppOpGenerator::visit(mir::ops::GatherOp& op) {
-  assert(false && "Unimplemented operation: GatherOp");
+  throw AclCppException("Unimplemented operation: GatherOp");
 }

 void AclCppOpGenerator::visit(ops::SigmoidOp& op) {
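The fixed permutation table genTranspose implements above, restated as a compact runnable lookup (aclPermFromMirPerm is a hypothetical name; the table entries are exactly those in the code):

    #include <cstddef>
    #include <map>
    #include <stdexcept>
    #include <vector>

    std::vector<std::size_t> aclPermFromMirPerm(const std::vector<std::size_t>& mir_perm) {
      // ACL counts axes from the fastest-moving one, hence the reversed look of the values.
      static const std::map<std::vector<std::size_t>, std::vector<std::size_t>> table{
          {{0, 3, 1, 2}, {1, 2, 0}},      // NHWC -> NCHW
          {{0, 2, 3, 1}, {2, 0, 1}},      // NCHW -> NHWC
          {{2, 3, 1, 0}, {3, 2, 0, 1}}};
      auto it = table.find(mir_perm);
      if (it == table.end())
        throw std::runtime_error("unsupported transpose sequence");
      return it->second;
    }
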
input, + const std::shared_ptr& output, + const std::vector& mir_perm) { + + // acl 18.8 opencl implementation supports only 3 types of permutation: + // in mir (0, 3, 1, 2), in acl(axes are in reverse order) (1, 2, 0) + // in mir (0, 2, 3, 1), in acl (2, 0, 1) + // in mir (2, 3, 1, 0), in acl (3, 2, 0, 1) + // so here we try to transform mir transpose into one acl supports + + const string& out_name = output->name(); + vector acl_perm; + + if (mir_perm == vector{0, 3, 1, 2}) + acl_perm = {1, 2, 0}; + else if (mir_perm == vector{0, 2, 3, 1}) + acl_perm = {2, 0, 1}; + else if (mir_perm == vector{2, 3, 1, 0}) + acl_perm = {3, 2, 0, 1}; + else + throw AclCppException("Unsupported transpose sequence in operation " + out_name); + + // Create operation parameter containing permutation vector + shared_ptr perm_vector = + genVectorInitializedVar(_constrBlock, "arm_compute::PermutationVector", + out_name + "_perm_param", acl_perm); + + // Instantiate the CLPermute object. + string layer_name = out_name + "_transpose_layer"; + list> arguments = {AF::ref(input), AF::ref(output), perm_vector}; + auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments); + allocate(output); + genLayerExecution(layer); } void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) { - assert(false && "Unimplemented operation: TransposeOp"); + auto& prev_nodes = op.getPrevNodes(); + assert(prev_nodes.size() == 1); + auto in_op = prev_nodes[0].op; + + // Get the input node tensor id in the DOM. + shared_ptr input = AF::id(tensorName(in_op)); + const vector& mir_axis_order = op.getAxisOrder(); + + // Create the output tensor in the DOM. + if (op.getOutputShape(0).rank() != 4) + throw AclCppException("Unsupported number of dimensions in transpose operation"); + // TODO replace transpose shape + shared_ptr output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0))); + + // Actual generation of operation and related stuff + genTranspose(input, output, mir_axis_order); } void AclCppOpGenerator::visit(mir::ops::GatherOp& op) { - assert(false && "Unimplemented operation: GatherOp"); + throw AclCppException("Unimplemented operation: GatherOp"); } void AclCppOpGenerator::visit(ops::SigmoidOp& op) { diff --git a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h index c306a2b..b3dc23f 100644 --- a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h +++ b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h @@ -82,6 +82,27 @@ private: using AF = ArtifactFactory; /** + * @brief generate transpose of input tensor NHWC -> NCHW + * @param name name of tensor containing transposed data + * @param input_shape shape of @p inpu + * @param input id of input tensor + * @return Id of result tensor + */ + std::shared_ptr genTransposeMIRtoACL(const std::string& name, + const mir::Shape& input_shape, + const std::shared_ptr& input); + + /** + * @brief generate transpose NCHW -> NHWC + * @param name name of tensor containing transposed data + * @param input_shape shape of @p inpu + * @param input id of input tensor + * @return Id of result tensor + */ + std::shared_ptr genTransposeACLtoMIR(const std::string& name, + const mir::Shape& input_shape, + const std::shared_ptr& input); + /** * @brief The common part of the convolution and the depthwise convolution. */ template @@ -152,14 +173,17 @@ private: std::string tensorName(const mir::Operation* op) const; /** - * @brief Generates tensor shape in DOM. + * @brief Generates variables tensor shape in DOM. 
diff --git a/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp b/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
index 2362b1d..0e18453 100644
--- a/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
+++ b/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
@@ -297,9 +297,6 @@ TEST(acl_backend_mir_to_dom, conv2d) {
   const ArtifactModule& m = dom_gen.generate(&g);

   checkDomStructure(m, {}, {});
-
-  stringstream code_out;
-  ArtifactGeneratorCppCode code_gen(code_out);
 }

 TEST(acl_backend_mir_to_dom, depthwise_conv) {
@@ -503,3 +500,31 @@ TEST(acl_backend_mir_to_dom, reshape) {

 TEST(acl_backend_mir_to_dom, DISABLED_pad) {
   // TODO
 }
+
+TEST(acl_backend_mir_to_dom, transpose) {
+  const int32_t channels = 2;
+  TensorVariant w = createTensorVariant({channels});
+
+  vector<size_t> perm{0, 3, 1, 2};
+
+  Graph g;
+  OpConstructor op_generator = [&perm](Graph& g, const vector<IODescriptor>& inputs) {
+    return g.create<mir::ops::TransposeOp>("transpose", inputs[0], perm);
+  };
+  vector<Shape> input_shapes{{1, 10, 10, channels}};
+
+  fillGraph(g, op_generator, input_shapes);
+
+  stringstream params_out;
+  AclCppOpGenerator dom_gen(artifactName, params_out);
+
+  const ArtifactModule& m = dom_gen.generate(&g);
+
+  checkDomStructure(m, {}, {});
+
+  ArtifactGeneratorCppCode code_gen(std::cerr);
+  m.accept(&code_gen);
+  ArtifactGeneratorCppDecl decl_gen(std::cerr);
+  m.accept(&decl_gen);
+}
-- 
2.7.4