From e5cde526949d96641769ecf67a6ccbf4fa417726 Mon Sep 17 00:00:00 2001
From: "Efimov Alexander/AI Tools Lab/./Samsung Electronics"
Date: Wed, 9 Jan 2019 19:01:26 +0300
Subject: [PATCH] [nnc] Support transpose in acl backend (#2746)

- Support transpose operations in acl backend
- Insert transposes in acl backend where needed
- Add test for transpose operation

Signed-off-by: Efimov Alexander
---
 .../passes/acl_soft_backend/AclCppOpGenerator.cpp  | 415 ++++++++++++++++-----
 .../passes/acl_soft_backend/AclCppOpGenerator.h    |  44 ++-
 contrib/nnc/unittests/acl_backend/MIRToDOM.cpp     |  31 +-
 3 files changed, 387 insertions(+), 103 deletions(-)

diff --git a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
index f7698ee..b0bb638 100644
--- a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
+++ b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
@@ -45,6 +45,7 @@
 #include "core/modelIR/operations/SoftmaxOp.h"
 #include "core/modelIR/operations/SqrtOp.h"
 #include "core/modelIR/operations/TanhOp.h"
+#include "core/modelIR/operations/TransposeOp.h"
 #include "core/modelIR/operations/VariableOp.h"

 #include

@@ -102,14 +103,26 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
 }

 void AclCppOpGenerator::visit(ops::ConcatOp& op) {
-  static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
-                                     "arm_compute::DataLayoutDimension::HEIGHT",
-                                     "arm_compute::DataLayoutDimension::WIDTH",
-                                     "arm_compute::DataLayoutDimension::CHANNEL"};
-
-  int axis = op.getAxis() < 0 ? op.getOutputShape(0).rank() + op.getAxis() : op.getAxis();
-  assert(axis < sizeof(axis_names) / sizeof(const char*));
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+
+  int axis = op.getAxis();
+  assert(axis < 4 && axis >= 0 && "axis outside this range is not supported in ACL");
+
+  const char* axis_name;
+  if (cli::debugTranspose) {
+    static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
+                                       "arm_compute::DataLayoutDimension::CHANNEL",
+                                       "arm_compute::DataLayoutDimension::HEIGHT",
+                                       "arm_compute::DataLayoutDimension::WIDTH"};
+    axis_name = axis_names[axis];
+  } else {
+    static const char* axis_names[] = {"arm_compute::DataLayoutDimension::BATCHES",
+                                       "arm_compute::DataLayoutDimension::HEIGHT",
+                                       "arm_compute::DataLayoutDimension::WIDTH",
+                                       "arm_compute::DataLayoutDimension::CHANNEL"};
+    axis_name = axis_names[axis];
+  }
+
+  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -118,10 +131,10 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
     _constrBlock->call("push_back", {AF::ref(AF::id(tensorName(i.op)))}, inputs);

   auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
-                        {inputs, AF::ref(out), AF::lit(axis_names[axis])});
+                        {inputs, AF::ref(out), AF::lit(axis_name)});

   allocate(out);
-  runLayer(layer);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(ops::Conv2DOp& op) {
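A note on the transposeShape<...> helper this patch uses heavily: judging by its call sites, the i-th template argument selects which source dimension becomes the i-th result dimension. A minimal compile-only sketch of the assumed semantics (transposeShapeSketch is a hypothetical name; the include path assumes nnc's source layout):

    #include "core/modelIR/Shape.h"

    // Assumed semantics: result.dim(i) == source.dim(Perm[i]).
    // For an MIR NHWC shape {N, H, W, C}:
    //   transposeShapeSketch<0, 3, 1, 2> -> {N, C, H, W}  // NCHW in MIR dimension order
    //   transposeShapeSketch<3, 2, 1, 0> -> {C, W, H, N}  // fully reversed: the order an
    //                                                     // arm_compute::TensorShape expects
    template <int... Perm>
    mir::Shape transposeShapeSketch(const mir::Shape& source) {
      return mir::Shape{source.dim(Perm)...};
    }
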
AclCppException("Unsupported number of dimensions in softmax"); + } + + auto out = genTensor(op, transposed_out_shape); auto prefix = out->name(); if (axis == 0) { @@ -174,7 +200,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) { auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer", {AF::ref(in), AF::ref(out)}); allocate(out); - runLayer(sm); + genLayerExecution(sm); } else { // Need to reshape before the Softmax application and after it. // Then we need two tensors for intermediate results. This is because we do a couple of auxiliary @@ -187,19 +213,19 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) { auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1", {AF::ref(in), AF::ref(tmp)}); allocate(tmp); - runLayer(transp1); + genLayerExecution(transp1); // Apply the softmax operaion. auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer", {AF::ref(tmp), AF::ref(tmp2)}); allocate(tmp2); - runLayer(sm); + genLayerExecution(sm); // Reshape the output to the original form. auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2", {AF::ref(tmp2), AF::ref(out)}); allocate(out); - runLayer(transp2); + genLayerExecution(transp2); } } @@ -239,6 +265,44 @@ static shared_ptr return pad_stride_info_var; } +shared_ptr +AclCppOpGenerator::genTransposeMIRtoACL(const string& name, + const Shape& input_shape, + const shared_ptr& input) { + + if (!cli::debugTranspose) { + // Generate output tensor description in the DOM. + shared_ptr output = AF::id(name); + + _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input}); + return output; + } + Shape transposed_shape = transposeShape<2, 1, 3, 0>(input_shape); + shared_ptr transposed_id = + genTensor(name, transposed_shape, false); + genTranspose(input, transposed_id, {0, 3, 1, 2}); + return transposed_id; +} + +shared_ptr +AclCppOpGenerator::genTransposeACLtoMIR(const string& name, + const Shape& input_shape, + const shared_ptr& input) { + + if (!cli::debugTranspose) { + // Generate output tensor description in the DOM. 
@@ -257,26 +321,53 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   assert(prev_nodes.size() == 1);
   auto in_op = prev_nodes[0].op;

-  auto in = AF::id(tensorName(in_op));
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
-  auto prefix = out->name() + "_pooling_layer";
+  string in_name = tensorName(in_op);
+  auto in_id = AF::id(in_name);

-  auto pad_stride_info_var = genPadStrideInfo(op, prefix, _constrBlock);
+  const string output_tensor_name = tensorName(&op);

-  auto pad_stride_info = pad_stride_info_var->use();
-  auto kernel_window_var = _constrBlock->var("arm_compute::Size2D", prefix + "_kernel_window", {},
-                                             {AF::lit(to_string(op.getWindowShape().dim(1))),
-                                              AF::lit(to_string(op.getWindowShape().dim(0)))});
-  auto kernel_window = kernel_window_var->use();
-  auto pooling_info_var = _constrBlock->var(
-      "arm_compute::PoolingLayerInfo", prefix + "_pooling_info", {},
+  // Transpose data from MIR format to format compatible with ACL
+  const string transposed_input_name = output_tensor_name + "_transposed_input";
+  shared_ptr<ArtifactId> transposed_input =
+      genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), in_id);
+
+  const string layer_name = output_tensor_name + "_pooling_layer";
+
+  shared_ptr<ArtifactVariable> pad_stride_info_var =
+      genPadStrideInfo(op, layer_name, _constrBlock);
+
+  shared_ptr<ArtifactId> pad_stride_info = pad_stride_info_var->use();
+
+  // Create kernel window info
+  shared_ptr<ArtifactVariable> kernel_window_var =
+      _constrBlock->var("arm_compute::Size2D", layer_name + "_kernel_window", {},
+                        {AF::lit(to_string(op.getWindowShape().dim(1))),
+                         AF::lit(to_string(op.getWindowShape().dim(0)))});
+  shared_ptr<ArtifactId> kernel_window = kernel_window_var->use();
+
+  // Create pooling info: pooling type, kernel info, strides, etc.
+  shared_ptr<ArtifactVariable> pooling_info_var = _constrBlock->var(
+      "arm_compute::PoolingLayerInfo", layer_name + "_pooling_info", {},
       {AF::lit(pooling_type), kernel_window, pad_stride_info,
        AF::lit(op.getBorderType() == ops::PoolOp::BorderType::EMPTY ? "true" : "false")});
-  auto pooling_info = pooling_info_var->use();
-  auto layer = genLayer("arm_compute::CLPoolingLayer", prefix,
-                        {AF::ref(in), AF::ref(out), pooling_info});
-  allocate(out);
-  runLayer(layer);
+  shared_ptr<ArtifactId> pooling_info = pooling_info_var->use();
+
+  // Generate auxiliary tensor to hold transposed output of pool in NCHW format
+  Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
+  shared_ptr<ArtifactId> transposed_output =
+      genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape));
+
+  // Actual layer creation
+  shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
+                                          {AF::ref(transposed_input), AF::ref(transposed_output),
+                                           pooling_info});
+  allocate(transposed_output);
+  genLayerExecution(layer);
+
+  shared_ptr<ArtifactId> output =
+      genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (op.getNextNodes().empty())
+    _outputs.insert(&op);
 }

 void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
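At inference time the DOM built above boils down to a permute/pool/permute pipeline. A sketch of the equivalent direct ACL calls, with hypothetical tensor and function names (the permutation vectors follow the MIR-to-ACL mapping documented in genTranspose further down):

    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include <arm_compute/runtime/CL/functions/CLPermute.h>
    #include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>

    // Sketch only: NHWC input is permuted to NCHW, pooled there, permuted back.
    void poolingPipelineSketch(arm_compute::CLTensor& in, arm_compute::CLTensor& nchw_in,
                               arm_compute::CLTensor& nchw_out, arm_compute::CLTensor& out,
                               const arm_compute::PoolingLayerInfo& pooling_info) {
      arm_compute::CLPermute to_nchw;
      arm_compute::CLPoolingLayer pool;
      arm_compute::CLPermute to_nhwc;
      to_nchw.configure(&in, &nchw_in, arm_compute::PermutationVector(1U, 2U, 0U));
      pool.configure(&nchw_in, &nchw_out, pooling_info);
      to_nhwc.configure(&nchw_out, &out, arm_compute::PermutationVector(2U, 0U, 1U));
      // The artifact's inference function then calls run() on each layer in this order.
    }
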
"true" : "false")}); - auto pooling_info = pooling_info_var->use(); - auto layer = genLayer("arm_compute::CLPoolingLayer", prefix, - {AF::ref(in), AF::ref(out), pooling_info}); - allocate(out); - runLayer(layer); + shared_ptr pooling_info = pooling_info_var->use(); + + // Generate auxiliary tensor to hold transposed output of pool in NCHW format + Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0)); + shared_ptr transposed_output = + genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape)); + + // Actual layer creation + shared_ptr layer = genLayer("arm_compute::CLPoolingLayer", layer_name, + {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info}); + allocate(transposed_output); + genLayerExecution(layer); + + shared_ptr output = + genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output); + + if (op.getNextNodes().empty()) + _outputs.insert(&op); } void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) { @@ -307,7 +398,7 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) { // Serialize the weights tensor and generate the function to deserialize it in the artifact. serializeTensor(weights, ir_weights); allocate(out); - runLayer(layer); + genLayerExecution(layer); } void AclCppOpGenerator::visit(ops::GemmOp& op) { @@ -327,28 +418,45 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) { auto in_op = prev_nodes[0].op; // Get the input node tensor id in the DOM. - auto in = AF::id(tensorName(in_op)); + shared_ptr input = AF::id(tensorName(in_op)); + + const string output_tensor_name = tensorName(&op); + + shared_ptr transposed_input; + Shape transposed_output_shape; + shared_ptr transposed_output; // Create the output tensor in the DOM and obtain its identifier. const Shape& out_shape = op.getOutputShape(0); - Shape transposed_shape; + const string transposed_output_name = output_tensor_name + "_transposed_output"; + switch (out_shape.rank()) { - case 4: - transposed_shape = transposeShape<2, 1, 3, 0>(out_shape); + case 4: { + // transpose input to NCHW format supported by ACL + const string transposed_input_name = output_tensor_name + "_transposed_input"; + transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape); + transposed_input = genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input); + + transposed_output = + genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape)); break; + } case 2: - transposed_shape = transposeShape<1, 0>(out_shape); + transposed_output_shape = out_shape; + transposed_input = input; + transposed_output = genTensor(tensorName(&op), transposeShape<1, 0>(transposed_output_shape)); break; case 1: - transposed_shape = out_shape; + transposed_output_shape = out_shape; + transposed_input = input; + transposed_output = genTensor(tensorName(&op), out_shape); break; default: throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank())); } - shared_ptr out = genTensor(op, transposed_shape); // Prefix used for the name of variables related to the operation implementation. - string operation_name = out->name() + "_bias_add_layer"; + string layer_name = transposed_output->name() + "_bias_add_layer"; // Reshape the IR biases tensor and generate the corresponding DOM tensor. 
@@ -458,7 +581,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
                         {AF::ref(in), AF::ref(out)});
   allocate(out);
-  runLayer(layer);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(ops::ScaleOp& op) {
@@ -469,32 +592,42 @@
   auto in_op = prev_nodes[0].op;

   // Get input tensor identifier in the generated artifact.
-  auto in = AF::id(tensorName(in_op));
+  auto input = AF::id(tensorName(in_op));
+
+  const string output_tensor_name = tensorName(&op);
+
+  // transpose input to NCHW format supported by ACL
+  const string transposed_input_name = output_tensor_name + "_transposed_input";
+  shared_ptr<ArtifactId> transposed_input =
+      genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input);

   // Create the output tensor in the DOM and obtain its identifier.
   const Shape& out_shape = op.getOutputShape(0);
-  Shape transposed_shape;
+  Shape transposed_output_shape;

   switch (out_shape.rank()) {
     case 4:
-      transposed_shape = transposeShape<2, 1, 3, 0>(out_shape);
+      transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape);
       break;
     case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
+      transposed_output_shape = transposeShape<1, 0>(out_shape);
       break;
     case 1:
-      transposed_shape = out_shape;
+      transposed_output_shape = out_shape;
       break;
     default:
       throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
   }

-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
-  auto operation_name = out->name() + "_scale_layer";
+  const string transposed_output_name = output_tensor_name + "_transposed_output";
+  shared_ptr<ArtifactId> transposed_output =
+      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+
+  auto operation_name = transposed_output->name() + "_scale_layer";
   const auto& ir_scales = op.getWeights();

   // Reshape the IR scales tensor and generate the corresponding DOM tensor.
-  const auto ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
+  const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
   Shape ir_scales_shape(ir_input_shape.rank());

   // ACL CLArithmeticDivision supports input tensors broadcasting.
@@ -517,12 +650,12 @@
   auto layer1 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_1",
                          {AF::ref(unit), AF::ref(scales), AF::ref(tmp)});
-  runLayer(layer1);
+  genLayerExecution(layer1);

   // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
   auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
                          operation_name + "_arithmetic_div_layer_2",
-                         {AF::ref(in), AF::ref(tmp), AF::ref(out)});
+                         {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)});

   allocate(scales);
   // Save the IR scales tensor to later read this in the artifact.
   serializeTensor(scales, ir_scales);
@@ -530,12 +663,18 @@
   // Fill the unit tensor with the 1 value.
   fillTensor(unit, "1");
   allocate(tmp);
-  allocate(out);
-  runLayer(layer2);
+  allocate(transposed_output);
+  genLayerExecution(layer2);
+
+  // Generate output in NHWC format
+  shared_ptr<ArtifactId> output =
+      genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+
+  if (op.getNextNodes().empty())
+    _outputs.insert(&op);
 }

 void AclCppOpGenerator::visit(mir::ops::SliceOp& op) {
-  assert(false && "Unimplemented operation: SliceOp");
+  throw AclCppException("Unimplemented operation: SliceOp");
 }

 void AclCppOpGenerator::visit(ops::BatchNormOp& op) {
@@ -564,7 +703,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) {

 void AclCppOpGenerator::visit(ops::ElementwiseOp& op) {
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));

   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() >= 2);
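The two CLArithmeticDivision layers in ScaleOp above exist because this ACL version offers no broadcasting multiply: the generator computes tmp = 1 / scales once, then out = in / tmp, which equals in * scales. A scalar sketch of the identity:

    #include <cassert>
    #include <cmath>

    // layer1: tmp = unit / scales;  layer2: out = input / tmp == input * scales.
    float scaleViaDivision(float input, float scale) {
      float tmp = 1.0f / scale;  // the "unit" tensor is filled with ones
      return input / tmp;
    }

    int main() {
      assert(std::fabs(scaleViaDivision(3.0f, 0.5f) - 1.5f) < 1e-6f);
      return 0;
    }
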
const Shape& out_shape = op.getOutputShape(0); - Shape transposed_shape; + Shape transposed_output_shape; switch (out_shape.rank()) { case 4: - transposed_shape = transposeShape<2, 1, 3, 0>(out_shape); + transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape); break; case 2: - transposed_shape = transposeShape<1, 0>(out_shape); + transposed_output_shape = transposeShape<1, 0>(out_shape); break; case 1: - transposed_shape = out_shape; + transposed_output_shape = out_shape; break; default: throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank())); } - shared_ptr out = genTensor(op, transposed_shape); - auto operation_name = out->name() + "_scale_layer"; + const string transposed_output_name = output_tensor_name + "_transposed_output"; + shared_ptr transposed_output = + genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape)); + + auto operation_name = transposed_output->name() + "_scale_layer"; const auto& ir_scales = op.getWeights(); // Reshape the IR scales tensor and generate the corresponding DOM tensor. - const auto ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0)); + const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0)); Shape ir_scales_shape(ir_input_shape.rank()); // ACL CLArithmeticDivision supports input tensors broadcasting. @@ -517,12 +650,12 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) { auto layer1 = genLayer("arm_compute::CLArithmeticDivision", operation_name + "_arithmetic_div_layer_1", {AF::ref(unit), AF::ref(scales), AF::ref(tmp)}); - runLayer(layer1); + genLayerExecution(layer1); // Create an instance of the CLArithmeticDivision class as a member of the artifact class. auto layer2 = genLayer("arm_compute::CLArithmeticDivision", operation_name + "_arithmetic_div_layer_2", - {AF::ref(in), AF::ref(tmp), AF::ref(out)}); + {AF::ref(transposed_input), AF::ref(tmp), AF::ref(transposed_output)}); allocate(scales); // Save the IR scales tensor to later read this in the artifact. serializeTensor(scales, ir_scales); @@ -530,12 +663,18 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) { // Fill the unit tensor with the 1 value. fillTensor(unit, "1"); allocate(tmp); - allocate(out); - runLayer(layer2); + allocate(transposed_output); + genLayerExecution(layer2); + + // Generate output in NHWC format + shared_ptr output = + genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output); + if (op.getNextNodes().empty()) + _outputs.insert(&op); } void AclCppOpGenerator::visit(mir::ops::SliceOp& op) { - assert(false && "Unimplemented operation: SliceOp"); + throw AclCppException( "Unimplemented operation: SliceOp"); } void AclCppOpGenerator::visit(ops::BatchNormOp& op) { @@ -564,7 +703,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) { void AclCppOpGenerator::visit(ops::ElementwiseOp& op) { // Create the output tensor in the DOM and obtain its identifier. - auto out = genTensor(op, op.getOutputShape(0)); + auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0))); auto& prev_nodes = op.getPrevNodes(); assert(prev_nodes.size() >= 2); @@ -621,12 +760,23 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons assert(prev_nodes.size() == 1); auto in_op = prev_nodes[0].op; + // get output tensor name that is used as base for other names + const string output_tensor_name = tensorName(&op); + // Get the identifier of the input tensor in the DOM. 
@@ -667,8 +826,13 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   auto in = AF::id(tensorName(in_op));

   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
-  auto prefix = out->name() + "_activation_layer";
+  shared_ptr<ArtifactId> output;
+  if (cli::debugTranspose)
+    output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  else
+    output = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+
+  auto prefix = output->name() + "_activation_layer";

   // Create an instance of the ActivationLayerInfo class as a local variable in the artifact
   // constructor. This instance provides information about the concrete activation function,
@@ -681,9 +845,9 @@
   // Create an instance of the CLActivationLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
-                        {AF::ref(in), AF::ref(out), activation_info});
-  allocate(out);
-  runLayer(layer);
+                        {AF::ref(in), AF::ref(output), activation_info});
+  allocate(output);
+  genLayerExecution(layer);
 }

 shared_ptr<ArtifactId> AclCppOpGenerator::genAddition(const string& prefix, int index,
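genActivation above packs the function kind plus its two optional parameters into an ActivationLayerInfo and hands it to CLActivationLayer. A sketch of the objects the generated code instantiates (hypothetical names; TANH is just an example):

    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

    void activationSketch(arm_compute::CLTensor& in, arm_compute::CLTensor& out,
                          float a, float b) {
      arm_compute::ActivationLayerInfo activation_info(
          arm_compute::ActivationLayerInfo::ActivationFunction::TANH, a, b);
      arm_compute::CLActivationLayer layer;
      layer.configure(&in, &out, activation_info);
    }
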
@@ -781,25 +945,37 @@ string AclCppOpGenerator::tensorName(const Operation* op) const {
   return tensor_name;
 }

-std::shared_ptr<ArtifactId> AclCppOpGenerator::genShape(ArtifactBlock* block, const string& prefix,
-                                                        const Shape& shape) {
+template <typename T>
+std::shared_ptr<ArtifactId>
+AclCppOpGenerator::genVectorInitializedVar(ArtifactBlock* block, const string& type,
+                                           const string& name, const vector<T>& init) {
   list<shared_ptr<ArtifactExpr>> dims;

-  for (int i = 0; i < shape.rank(); ++i)
-    dims.push_back(AF::lit(to_string(shape.dim(i))));
+  for (int i = 0; i < init.size(); ++i)
+    dims.push_back(AF::lit(to_string(init[i])));

-  auto shape_var = block->var("arm_compute::TensorShape", prefix + "_shape", {}, dims);
+  auto shape_var = block->var(type, name, {}, dims);
   auto shape_id = shape_var->use();
   return shape_id;
 }

-shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name, const Shape& ir_shape,
+shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name,
+                                                    const Shape& ir_shape,
                                                     bool gen_accessor) {
   auto id = AF::id(name);

   if (_tensorNames.insert(name).second) {
     _artifactClass->var(false, "arm_compute::CLTensor", name);
-    auto shape = genShape(_constrBlock, name, ir_shape);
+    vector<int32_t> shape_vectorized;
+
+    // Create a vector of initializers from the Shape.
+    shape_vectorized.reserve(ir_shape.rank());
+    for (int i = 0; i < ir_shape.rank(); ++i)
+      shape_vectorized.push_back(ir_shape.dim(i));
+
+    const char* type_name = "arm_compute::TensorShape";
+    shared_ptr<ArtifactId> shape =
+        genVectorInitializedVar(_constrBlock, type_name, name + "_shape", shape_vectorized);
     _constrBlock->call("initializeTensor", {id, shape});

     if (gen_accessor) {
@@ -895,11 +1071,11 @@ void AclCppOpGenerator::fillTensor(shared_ptr<ArtifactId> tensor_id, const strin
 }

 void AclCppOpGenerator::visit(ops::SqueezeOp& op) {
-  assert(false && "Unimplemented operation: Squeeze");
+  throw AclCppException("Unimplemented operation: Squeeze");
 }

 void AclCppOpGenerator::visit(ops::SqrtOp& op) {
-  assert(false && "Unimplemented operation: Sqrt");
+  throw AclCppException("Unimplemented operation: Sqrt");
 }

 void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
@@ -911,34 +1087,83 @@ void AclCppOpGenerator::genAllocates() {
     _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
 }

-shared_ptr<ArtifactId> AclCppOpGenerator::genLayer(
-    const string& layer_type,
-    const string& layer_name,
-    const list<shared_ptr<ArtifactExpr>>& config_params) {
+shared_ptr<ArtifactId>
+AclCppOpGenerator::genLayer(const string& layer_type, const string& layer_name,
+                            const list<shared_ptr<ArtifactExpr>>& config_params) {
   auto layer_var = _artifactClass->var(false, layer_type, layer_name);
   auto layer = layer_var->use();
   _constrBlock->call("configure", config_params, layer);
   return layer;
 }

-void AclCppOpGenerator::runLayer(shared_ptr<ArtifactId> layer_id) {
+void AclCppOpGenerator::genLayerExecution(shared_ptr<ArtifactId> layer_id) {
   _infBlock->call("run", {}, layer_id);
 }

 void AclCppOpGenerator::visit(mir::ops::ResizeOp& op) {
-  assert(false && "Unimplemented operation: Resize");
+  throw AclCppException("Unimplemented operation: Resize");
 }

 void AclCppOpGenerator::visit(mir::ops::ReduceFOp& op) {
-  assert(false && "Unimplemented operation: ReduceFOp");
+  throw AclCppException("Unimplemented operation: ReduceFOp");
+}
+
+void AclCppOpGenerator::genTranspose(const std::shared_ptr<ArtifactId>& input,
+                                     const std::shared_ptr<ArtifactId>& output,
+                                     const std::vector<size_t>& mir_perm) {
+
+  // The ACL 18.8 OpenCL implementation supports only 3 types of permutation:
+  // in MIR (0, 3, 1, 2), in ACL (axes are in reverse order): (1, 2, 0)
+  // in MIR (0, 2, 3, 1), in ACL: (2, 0, 1)
+  // in MIR (2, 3, 1, 0), in ACL: (3, 2, 0, 1)
+  // so here we try to transform the MIR transpose into one ACL supports.
+
+  const string& out_name = output->name();
+  vector<size_t> acl_perm;
+
+  if (mir_perm == vector<size_t>{0, 3, 1, 2})
+    acl_perm = {1, 2, 0};
+  else if (mir_perm == vector<size_t>{0, 2, 3, 1})
+    acl_perm = {2, 0, 1};
+  else if (mir_perm == vector<size_t>{2, 3, 1, 0})
+    acl_perm = {3, 2, 0, 1};
+  else
+    throw AclCppException("Unsupported transpose sequence in operation " + out_name);
+
+  // Create operation parameter containing permutation vector
+  shared_ptr<ArtifactId> perm_vector =
+      genVectorInitializedVar(_constrBlock, "arm_compute::PermutationVector",
+                              out_name + "_perm_param", acl_perm);
+
+  // Instantiate the CLPermute object.
+  string layer_name = out_name + "_transpose_layer";
+  list<shared_ptr<ArtifactExpr>> arguments = {AF::ref(input), AF::ref(output), perm_vector};
+  auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments);
+  allocate(output);
+  genLayerExecution(layer);
 }

 void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
-  assert(false && "Unimplemented operation: TransposeOp");
+  auto& prev_nodes = op.getPrevNodes();
+  assert(prev_nodes.size() == 1);
+  auto in_op = prev_nodes[0].op;
+
+  // Get the input node tensor id in the DOM.
+  shared_ptr<ArtifactId> input = AF::id(tensorName(in_op));
+  const vector<size_t>& mir_axis_order = op.getAxisOrder();
+
+  // Create the output tensor in the DOM.
+  if (op.getOutputShape(0).rank() != 4)
+    throw AclCppException("Unsupported number of dimensions in transpose operation");
+  // TODO replace transpose shape
+  shared_ptr<ArtifactId> output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+
+  // Actual generation of the operation and related objects
+  genTranspose(input, output, mir_axis_order);
 }

 void AclCppOpGenerator::visit(mir::ops::GatherOp& op) {
-  assert(false && "Unimplemented operation: GatherOp");
+  throw AclCppException("Unimplemented operation: GatherOp");
 }

 void AclCppOpGenerator::visit(ops::SigmoidOp& op) {
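The fixed permutation table genTranspose implements above, restated as a compact runnable lookup (aclPermFromMirPerm is a hypothetical name; the table entries are exactly those in the code):

    #include <cstddef>
    #include <map>
    #include <stdexcept>
    #include <vector>

    std::vector<std::size_t> aclPermFromMirPerm(const std::vector<std::size_t>& mir_perm) {
      // ACL counts axes from the fastest-moving one, hence the reversed look of the values.
      static const std::map<std::vector<std::size_t>, std::vector<std::size_t>> table{
          {{0, 3, 1, 2}, {1, 2, 0}},      // NHWC -> NCHW
          {{0, 2, 3, 1}, {2, 0, 1}},      // NCHW -> NHWC
          {{2, 3, 1, 0}, {3, 2, 0, 1}}};
      auto it = table.find(mir_perm);
      if (it == table.end())
        throw std::runtime_error("unsupported transpose sequence");
      return it->second;
    }
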
input, + const std::shared_ptr& output, + const std::vector& mir_perm) { + + // acl 18.8 opencl implementation supports only 3 types of permutation: + // in mir (0, 3, 1, 2), in acl(axes are in reverse order) (1, 2, 0) + // in mir (0, 2, 3, 1), in acl (2, 0, 1) + // in mir (2, 3, 1, 0), in acl (3, 2, 0, 1) + // so here we try to transform mir transpose into one acl supports + + const string& out_name = output->name(); + vector acl_perm; + + if (mir_perm == vector{0, 3, 1, 2}) + acl_perm = {1, 2, 0}; + else if (mir_perm == vector{0, 2, 3, 1}) + acl_perm = {2, 0, 1}; + else if (mir_perm == vector{2, 3, 1, 0}) + acl_perm = {3, 2, 0, 1}; + else + throw AclCppException("Unsupported transpose sequence in operation " + out_name); + + // Create operation parameter containing permutation vector + shared_ptr perm_vector = + genVectorInitializedVar(_constrBlock, "arm_compute::PermutationVector", + out_name + "_perm_param", acl_perm); + + // Instantiate the CLPermute object. + string layer_name = out_name + "_transpose_layer"; + list> arguments = {AF::ref(input), AF::ref(output), perm_vector}; + auto layer = genLayer("arm_compute::CLPermute", layer_name, arguments); + allocate(output); + genLayerExecution(layer); } void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) { - assert(false && "Unimplemented operation: TransposeOp"); + auto& prev_nodes = op.getPrevNodes(); + assert(prev_nodes.size() == 1); + auto in_op = prev_nodes[0].op; + + // Get the input node tensor id in the DOM. + shared_ptr input = AF::id(tensorName(in_op)); + const vector& mir_axis_order = op.getAxisOrder(); + + // Create the output tensor in the DOM. + if (op.getOutputShape(0).rank() != 4) + throw AclCppException("Unsupported number of dimensions in transpose operation"); + // TODO replace transpose shape + shared_ptr output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0))); + + // Actual generation of operation and related stuff + genTranspose(input, output, mir_axis_order); } void AclCppOpGenerator::visit(mir::ops::GatherOp& op) { - assert(false && "Unimplemented operation: GatherOp"); + throw AclCppException("Unimplemented operation: GatherOp"); } void AclCppOpGenerator::visit(ops::SigmoidOp& op) { diff --git a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h index c306a2b..b3dc23f 100644 --- a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h +++ b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.h @@ -82,6 +82,27 @@ private: using AF = ArtifactFactory; /** + * @brief generate transpose of input tensor NHWC -> NCHW + * @param name name of tensor containing transposed data + * @param input_shape shape of @p inpu + * @param input id of input tensor + * @return Id of result tensor + */ + std::shared_ptr genTransposeMIRtoACL(const std::string& name, + const mir::Shape& input_shape, + const std::shared_ptr& input); + + /** + * @brief generate transpose NCHW -> NHWC + * @param name name of tensor containing transposed data + * @param input_shape shape of @p inpu + * @param input id of input tensor + * @return Id of result tensor + */ + std::shared_ptr genTransposeACLtoMIR(const std::string& name, + const mir::Shape& input_shape, + const std::shared_ptr& input); + /** * @brief The common part of the convolution and the depthwise convolution. */ template @@ -152,14 +173,17 @@ private: std::string tensorName(const mir::Operation* op) const; /** - * @brief Generates tensor shape in DOM. + * @brief Generates variables tensor shape in DOM. 
diff --git a/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp b/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
index 2362b1d..0e18453 100644
--- a/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
+++ b/contrib/nnc/unittests/acl_backend/MIRToDOM.cpp
@@ -297,9 +297,6 @@ TEST(acl_backend_mir_to_dom, conv2d) {
   const ArtifactModule& m = dom_gen.generate(&g);

   checkDomStructure(m, {}, {});
-
-  stringstream code_out;
-  ArtifactGeneratorCppCode code_gen(code_out);
 }

 TEST(acl_backend_mir_to_dom, depthwise_conv) {
@@ -503,3 +500,31 @@ TEST(acl_backend_mir_to_dom, reshape) {

 TEST(acl_backend_mir_to_dom, DISABLED_pad) {
   // TODO
 }
+
+TEST(acl_backend_mir_to_dom, transpose) {
+  const int32_t channels = 2;
+  TensorVariant w = createTensorVariant({channels});
+
+  vector<size_t> perm{0, 3, 1, 2};
+
+  Graph g;
+  OpConstructor op_generator = [&perm](Graph& g, const vector<IODescriptor>& inputs) {
+    return g.create<mir::ops::TransposeOp>("transpose", inputs[0], perm);
+  };
+  vector<Shape> input_shapes{{1, 10, 10, channels}};
+
+  fillGraph(g, op_generator, input_shapes);
+
+  stringstream params_out;
+  AclCppOpGenerator dom_gen(artifactName, params_out);
+
+  const ArtifactModule& m = dom_gen.generate(&g);
+
+  checkDomStructure(m, {}, {});
+
+  ArtifactGeneratorCppCode code_gen(std::cerr);
+  m.accept(&code_gen);
+  ArtifactGeneratorCppDecl decl_gen(std::cerr);
+  m.accept(&decl_gen);
+}
-- 
2.7.4