[nnc] remove redundant shape transposes from acl backend (#2764)
author    Efimov Alexander/AI Tools Lab/./Samsung Electronics <a.efimov@samsung.com>
          Thu, 10 Jan 2019 10:09:42 +0000 (13:09 +0300)
committer GitHub Enterprise <noreply-CODE@samsung.com>
          Thu, 10 Jan 2019 10:09:42 +0000 (13:09 +0300)
- Remove redundant reverse shape transposes such as [0,1,2,3] -> [3,2,1,0]
- Refactor tensor serialization

Signed-off-by: Efimov Alexander <a.efimov@samsung.com>
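
For reference, here is a minimal sketch of the assumed transposeShape semantics and of why the second reversal became redundant. The real mir::Shape is dynamic-rank; the fixed rank-4 stand-in below is for illustration only.

#include <array>
#include <cstdint>

// Rank-4 stand-in for mir::Shape; dim() accepts negative indices
// counted from the end, as in the genTensor() change below.
struct Shape4 {
  std::array<int32_t, 4> dims;
  int32_t dim(int i) const { return dims[i < 0 ? 4 + i : i]; }
};

// Destination slot k takes source dimension Perm[k], so
// transposeShape<3, 2, 1, 0> simply reverses a 4-D shape.
template <int... Perm>
Shape4 transposeShape(const Shape4& s) {
  return Shape4{{s.dim(Perm)...}};
}

// Before this patch every caller pre-reversed its shape, e.g.
//   genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
// because genTensor() pushed dimensions front to back while
// arm_compute::TensorShape expects them innermost-first. genTensor()
// now walks the dimensions back to front (ir_shape.dim(-i - 1)), so
// callers pass MIR shapes unchanged and the double reversal disappears.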
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp

index b0bb638..1a159e4 100644
@@ -122,7 +122,7 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
     axis_name = axis_names[axis];
   }
 
-  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  auto out = genTensor(op, op.getOutputShape(0));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -155,7 +155,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
  // CLPermute currently does not support all kinds of permutations.
  // The rank can be more than 2 in our models, so we cannot use CLTranspose.
  // This means we can support tensors with no more than one axis > 1.
-  int axis = op.getAxis() < 0 ? rank + op.getAxis() : op.getAxis();
+  int axis = op.getAxis();
   assert(axis == rank - 1);
   int nof_long_axes = 0;
 
@@ -169,62 +169,48 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     throw AclCppException("Unsupported Softmax operation with several dimensions greater than 1");
 
   // Create the output tensor.
-  Shape in_out_shape(op.getOutputShape(0));
-  Shape sm_shape(in_out_shape);
-
-  if (axis != 0) {
-    int sm_dim = sm_shape.dim(axis);
-    sm_shape.dim(axis) = sm_shape.dim(0);
-    sm_shape.dim(0) = sm_dim;
-  }
-
-  Shape transposed_out_shape;
-
-  switch (in_out_shape.rank()) {
-    case 4:
-      transposed_out_shape = transposeShape<3, 2, 1, 0>(in_out_shape);
-      break;
-    case 2:
-      transposed_out_shape = transposeShape<1, 0>(in_out_shape);
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions in softmax");
-  }
+  const Shape& in_out_shape = op.getOutputShape(0);
 
-  auto out = genTensor(op, transposed_out_shape);
-  auto prefix = out->name();
+  shared_ptr<ArtifactId> output = genTensor(op, in_out_shape);
+  auto layer_name_prefix = output->name();
 
   if (axis == 0) {
    // Simple version: no pre- or post-reshapes needed.
-    // Apply the softmax operaion.
-    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
-                       {AF::ref(in), AF::ref(out)});
-    allocate(out);
+    // Apply the softmax operation.
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
+                       {AF::ref(in), AF::ref(output)});
+    allocate(output);
     genLayerExecution(sm);
   } else {
+    // TODO: refactor this code; it currently works only with batch size 1
+
     // Need to reshape before the Softmax application and after it.
     // Then we need two tensors for intermediate results. This is because we do a couple of auxiliary
     // reshapes: one to transform the input tensor to a unidimensional tensor and the second to
     // transform the result of the softmax operation back to the original form.
-    auto tmp = genTensor(prefix + "_tmp", sm_shape);
-    auto tmp2 = genTensor(prefix + "_tmp2", sm_shape);
+    Shape sm_shape(in_out_shape);
+
+    std::swap(sm_shape.dim(axis), sm_shape.dim(-1));
+
+    auto tmp = genTensor(layer_name_prefix + "_tmp", sm_shape);
+    auto tmp2 = genTensor(layer_name_prefix + "_tmp2", sm_shape);
 
     // Do the input permutation.
-    auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1",
+    auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
                           {AF::ref(in), AF::ref(tmp)});
     allocate(tmp);
     genLayerExecution(transp1);
 
     // Apply the softmax operation.
-    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(tmp), AF::ref(tmp2)});
     allocate(tmp2);
     genLayerExecution(sm);
 
     // Reshape the output to the original form.
-    auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2",
-                          {AF::ref(tmp2), AF::ref(out)});
-    allocate(out);
+    auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
+                          {AF::ref(tmp2), AF::ref(output)});
+    allocate(output);
     genLayerExecution(transp2);
   }
 }
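
The non-trivial branch above is a reshape/softmax/reshape sandwich. A trace with hypothetical shapes, under the single-long-axis constraint checked at the top of the function:

// {1, 1, 1, 1000}: accepted -- exactly one axis > 1
// {1, 5, 1, 1000}: rejected -- throws "Unsupported Softmax operation ..."
//
// in   --CLReshapeLayer--> tmp    sm_shape: softmax axis swapped to the back
//                                 (a no-op while the assert pins axis to rank - 1)
// tmp  --CLSoftmaxLayer--> tmp2   softmax over the last axis
// tmp2 --CLReshapeLayer--> out    restore the original form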
@@ -277,7 +263,7 @@ AclCppOpGenerator::genTransposeMIRtoACL(const string& name,
     _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
     return output;
   }
-  Shape transposed_shape = transposeShape<2, 1, 3, 0>(input_shape);
+  Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
   genTranspose(input, transposed_id, {0, 3, 1, 2});
@@ -296,7 +282,7 @@ AclCppOpGenerator::genTransposeACLtoMIR(const string& name,
     _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
     return output;
   }
-  Shape transposed_shape = transposeShape<1, 3, 2, 0>(input_shape);
+  Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
   genTranspose(input, transposed_id, {0, 2, 3, 1});
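
The new template arguments are the plain NHWC <-> NCHW permutations; the old ones were these permutations pre-composed with the [3, 2, 1, 0] reversal that genTensor() used to require. For a hypothetical shape with dimensions N, H, W, C:

// MIR (NHWC) -> ACL (NCHW)
//   new: transposeShape<0, 3, 1, 2>({N, H, W, C}) == {N, C, H, W}
//   old: transposeShape<2, 1, 3, 0>({N, H, W, C}) == {W, H, C, N} -- the reverse
// ACL (NCHW) -> MIR (NHWC)
//   new: transposeShape<0, 2, 3, 1>({N, C, H, W}) == {N, H, W, C}
//   old: transposeShape<1, 3, 2, 0>({N, C, H, W}) == {C, W, H, N} -- the reverse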
@@ -355,7 +341,7 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   // Generate auxiliary tensor to hold transposed output of pool in NCHW format
   Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(layer_name + "_out_transpose", transposed_output_shape);
 
   // Actual layer creation
   shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
@@ -371,7 +357,7 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
 }
 
 void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
-  const TensorVariant& ir_weights = op.getWeights();
+  const TensorVariant ir_weights = transposeTensor<1, 0>(op.getWeights());
   const Shape& ir_weights_shape = ir_weights.getShape();
 
   auto& prev_nodes = op.getPrevNodes();
@@ -385,7 +371,7 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
   const Shape& out_shape = op.getOutputShape(0);
   if (out_shape.rank() != 2)
     throw AclCppException("Unsupported number of dimensions in fc layer");
-  auto out = genTensor(op, transposeShape<1, 0>(out_shape));
+  auto out = genTensor(op, out_shape);
   string operation_name = out->name() + "_fully_connected_layer";
 
   // Create the weights tensor in the DOM and use its id.
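
The new explicit weight transpose compensates for the serializer refactoring at the end of this diff: the removed serialization loop incremented coordinate 0 first, i.e. it emitted elements in reversed-axis order, while the new serializer dumps memory as-is. A sketch for the 2-D case, assuming MIR tensors are stored row-major:

// old: weights {in, out} serialized with axis 0 varying fastest
//      -> byte stream of the {out, in} matrix
// new: transposeTensor<1, 0> builds the {out, in} matrix explicitly,
//      then it is serialized contiguously -> the same byte stream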
@@ -438,14 +424,10 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
       transposed_input = genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input);
 
       transposed_output =
-          genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+          genTensor(transposed_output_name, transposed_output_shape);
       break;
     }
     case 2:
-      transposed_output_shape = out_shape;
-      transposed_input = input;
-      transposed_output = genTensor(tensorName(&op), transposeShape<1, 0>(transposed_output_shape));
-      break;
     case 1:
       transposed_output_shape = out_shape;
       transposed_input = input;
@@ -462,18 +444,11 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   const auto ir_input_shape = op.getInputShape(0);
   Shape ir_biases_shape(ir_input_shape.rank());
 
-  // TODO remove this if after batch axis is restored in all operations in Model IR
-  if (op.getPrevNodes()[0].op->getType() == Operation::Type::fullyConnected) {
-    // Fully connected layer restores batch axis in result, so need to copy shape with redundant 1
-    // Shape transpose is needed to generate axises in reverse order
-    ir_biases_shape = transposeShape<1, 0>(op.getInputShape(0));
-  } else {
-    // ACL CLArithmeticAddition supports input tensors broadcasting.
-    for (int i = 0; i < ir_input_shape.rank(); ++i)
-      ir_biases_shape.dim(i) = 1;
+  // ACL CLArithmeticAddition supports input tensors broadcasting.
+  for (int i = 0; i < ir_input_shape.rank(); ++i)
+    ir_biases_shape.dim(i) = 1;
 
-    ir_biases_shape.dim(2) = ir_biases.getShape().dim(0);
-  }
+  ir_biases_shape.dim(1) = ir_biases.getShape().dim(0);
   auto biases = genTensor(layer_name + "_biases", ir_biases_shape);
 
   // Instantiate the CLArithmeticAddition object.
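
At this point the DOM input is already in NCHW order (via genTransposeMIRtoACL above), so the channel axis is dim 1 and the broadcastable bias shape for an assumed rank-4 input looks like:

// input  : {N, C, H, W}
// biases : {1, C, 1, 1}   // ir_biases_shape.dim(1) = ir_biases.getShape().dim(0)
// CLArithmeticAddition then broadcasts the bias across N, H and W.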
@@ -499,15 +474,12 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
 void AclCppOpGenerator::visit(ops::VariableOp& op) {
   shared_ptr<ArtifactId> tensor;
   if (cli::debugTranspose) {
-    if (op.getOutputShape(0).rank() == 2)
-      tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
-    else
-      tensor = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+    tensor = genTensor(op, op.getOutputShape(0));
   } else {
-    if (op.getOutputShape(0).rank() == 2)
-      tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
+    if (op.getOutputShape(0).rank() == 4)
+      tensor = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
     else
-      tensor = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+      tensor = genTensor(op, op.getOutputShape(0));
   }
   allocate(tensor);
 }
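
In effect, with cli::debugTranspose set the variable keeps its MIR layout, while in the default mode 4-D variables are created directly in the ACL-friendly NCHW order:

// debugTranspose : {N, H, W, C} -> {N, H, W, C}  (unchanged)
// default, 4-D   : {N, H, W, C} -> {N, C, H, W}  (transposeShape<0, 3, 1, 2>)
// default, other : shape passed through unchanged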
@@ -516,22 +488,7 @@ void AclCppOpGenerator::visit(ops::ConstantOp& op) {
   Shape out_shape = op.getOutputShape(0);
   TensorVariant data = op.getValue();
 
-  Shape transposed_shape;
-  // FIXME This is temporary solution,
-  // need to move this shape transposes into genTensor function and
-  // implement transpose operation to support ranks greater than 2
-  switch (out_shape.rank()) {
-    case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
-      break;
-    case 1:
-      transposed_shape = out_shape;
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
-  }
-
-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+  shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
   allocate(out);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
@@ -564,18 +521,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
     }
   }
 
-  Shape transposed_shape;
-  switch (out_shape.rank()) {
-    case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
-      break;
-    case 1:
-      transposed_shape = out_shape;
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
-  }
-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+  shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
@@ -609,8 +555,6 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
       transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape);
       break;
     case 2:
-      transposed_output_shape = transposeShape<1, 0>(out_shape);
-      break;
     case 1:
       transposed_output_shape = out_shape;
       break;
@@ -620,21 +564,21 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
 
   const string transposed_output_name = output_tensor_name + "_transposed_output";
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(transposed_output_name, transposed_output_shape);
 
   auto operation_name = transposed_output->name() + "_scale_layer";
 
   const auto& ir_scales = op.getWeights();
 
   // Reshape the IR scales tensor and generate the corresponding DOM tensor.
-  const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
+  const Shape ir_input_shape = transposeShape<0, 3, 1, 2>(op.getInputShape(0));
   Shape ir_scales_shape(ir_input_shape.rank());
 
   // ACL CLArithmeticDivision supports input tensors broadcasting.
   for (int i = 0; i < ir_input_shape.rank(); ++i)
     ir_scales_shape.dim(i) = 1;
 
-  ir_scales_shape.dim(2) = ir_scales.getShape().dim(0);
+  ir_scales_shape.dim(1) = ir_scales.getShape().dim(0);
   auto scales = genTensor(operation_name + "_scales", ir_scales_shape);
 
   // We do not use the genMultiplication() function here because the input needs broadcasting.
@@ -703,7 +647,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) {
 
 void AclCppOpGenerator::visit(ops::ElementwiseOp& op) {
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  auto out = genTensor(op, op.getOutputShape(0));
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() >= 2);
@@ -751,10 +695,8 @@ void AclCppOpGenerator::visit(ops::PadOp& op) {
 
 template <typename Op>
 void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
-  auto ir_weights = transposeTensor<1, 0, 2, 3>(op.getKernel());
+  auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
   const auto& ir_weights_shape = ir_weights.getShape();
-  assert(ir_weights_shape.rank() == 4);
-  Shape ir_biases_shape({ir_weights_shape.dim(-1)});
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() == 1);
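
The same serializer argument as in the fully connected case explains the new kernel permutation: reversing the axes of the old <1, 0, 2, 3> result gives exactly <3, 2, 0, 1>, so an explicitly transposed kernel dumped contiguously matches what the old reversed-order serializer emitted. For a hypothetical kernel shape {d0, d1, d2, d3}:

// old: transposeTensor<1, 0, 2, 3> -> {d1, d0, d2, d3},
//      serialized with axis 0 fastest   -> stream in {d3, d2, d0, d1} order
// new: transposeTensor<3, 2, 0, 1> -> {d3, d2, d0, d1},
//      serialized contiguously          -> the same stream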
@@ -774,7 +716,7 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
   const string transposed_output_name = output_tensor_name + "_transposed_output";
   Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(transposed_output_name, transposed_output_shape);
 
   string operation_name = output_tensor_name + suffix;
 
@@ -828,9 +770,9 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   // Create the output tensor in the DOM and return its id.
   shared_ptr<ArtifactId> output;
   if (cli::debugTranspose)
-    output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+    output = genTensor(op, op.getOutputShape(0));
   else
-    output = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+    output = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
 
   auto prefix = output->name() + "_activation_layer";
 
@@ -971,7 +913,7 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name,
     // create vector of initializers from Shape
     shape_vectorized.reserve(ir_shape.rank());
     for (int i = 0; i < ir_shape.rank(); ++i)
-      shape_vectorized.push_back(ir_shape.dim(i));
+      shape_vectorized.push_back(ir_shape.dim(-i - 1));
 
     const char* type_name = "arm_compute::TensorShape";
     shared_ptr<ArtifactId> shape =
@@ -1032,28 +974,8 @@ void AclCppOpGenerator::serializeIRTensor(const TensorVariant& tensor) {
     dimensions.at(i) = shape.dim(i);
   }
 
-  for (;;) {
-    float v;
-    memcpy(&v, tensor.at(coords), tensor.getElementSize());
-    _parOut.write(tensor.at(coords), tensor.getElementSize());
-    bool stop = true;
-    int i;
-
-    for (i = 0; i < shape.rank(); ++i) {
-      if(coords.at(i) < dimensions.at(i) - 1) {
-        ++coords.at(i);
-        stop = false;
-        break;
-      }
-    }
-
-    if (stop) {
-      break;
-    } else {
-      for (int j = 0; j < i; ++j)
-        coords.at(j) = 0;
-    }
-  }
+  size_t data_size = tensor.getElementSize() * tensor.getShape().numElements();
+  _parOut.write(tensor.atOffset(0), data_size);
 }
 
 void AclCppOpGenerator::genDeserializations() {
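
A minimal sketch of the new serialization path, with hypothetical simplified signatures (the real TensorVariant lives in MIR and is assumed to expose contiguous storage through atOffset()):

#include <cstddef>
#include <ostream>

// One write over the whole backing buffer replaces the removed coordinate
// "odometer" loop. This is valid only when the in-memory element order
// already matches the desired on-disk order; for the weight tensors in
// this diff that is arranged by the explicit transposeTensor calls at the
// call sites (see the FullyConnected and convolution hunks above).
void serializeContiguous(std::ostream& out, const char* base,
                         size_t element_size, size_t num_elements) {
  out.write(base, static_cast<std::streamsize>(element_size * num_elements));
}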
@@ -1156,7 +1078,7 @@ void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
   if (op.getOutputShape(0).rank() != 4)
     throw AclCppException("Unsupported number of dimensions in transpose operation");
   // TODO replace transpose shape
-  shared_ptr<ArtifactId> output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
 
   // Actual generation of operation and related stuff
   genTranspose(input, output, mir_axis_order);