[nnc] remove redundant shape transposes from acl backend (#2764)
author    Efimov Alexander/AI Tools Lab/./Samsung Electronics <a.efimov@samsung.com>
          Thu, 10 Jan 2019 10:09:42 +0000 (13:09 +0300)
committer GitHub Enterprise <noreply-CODE@samsung.com>
          Thu, 10 Jan 2019 10:09:42 +0000 (13:09 +0300)
- Remove redundant reverse shape transposes such as [0,1,2,3] -> [3,2,1,0]
- Refactor tensor serialization

Signed-off-by: Efimov Alexander <a.efimov@samsung.com>
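
For reference, here is a minimal sketch of the assumed transposeShape semantics and of why the second reversal became redundant. The real mir::Shape is dynamic-rank; the fixed rank-4 stand-in below is for illustration only.

#include <array>
#include <cstdint>

// Rank-4 stand-in for mir::Shape; dim() accepts negative indices
// counted from the end, as in the genTensor() change below.
struct Shape4 {
  std::array<int32_t, 4> dims;
  int32_t dim(int i) const { return dims[i < 0 ? 4 + i : i]; }
};

// Destination slot k takes source dimension Perm[k], so
// transposeShape<3, 2, 1, 0> simply reverses a 4-D shape.
template <int... Perm>
Shape4 transposeShape(const Shape4& s) {
  return Shape4{{s.dim(Perm)...}};
}

// Before this patch every caller pre-reversed its shape, e.g.
//   genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
// because genTensor() pushed dimensions front to back while
// arm_compute::TensorShape expects them innermost-first. genTensor()
// now walks the dimensions back to front (ir_shape.dim(-i - 1)), so
// callers pass MIR shapes unchanged and the double reversal disappears.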
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp

index b0bb638..1a159e4 100644
@@ -122,7 +122,7 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
     axis_name = axis_names[axis];
   }
 
-  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  auto out = genTensor(op, op.getOutputShape(0));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -155,7 +155,7 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
  // CLPermute currently does not support all kinds of permutations.
  // The rank can be more than 2 in our models, so we cannot use CLTranspose.
  // This means we can support tensors with no more than one axis > 1.
-  int axis = op.getAxis() < 0 ? rank + op.getAxis() : op.getAxis();
+  int axis = op.getAxis();
   assert(axis == rank - 1);
   int nof_long_axes = 0;
 
@@ -169,62 +169,48 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
     throw AclCppException("Unsupported Softmax operation with several dimensions greater than 1");
 
   // Create the output tensor.
-  Shape in_out_shape(op.getOutputShape(0));
-  Shape sm_shape(in_out_shape);
-
-  if (axis != 0) {
-    int sm_dim = sm_shape.dim(axis);
-    sm_shape.dim(axis) = sm_shape.dim(0);
-    sm_shape.dim(0) = sm_dim;
-  }
-
-  Shape transposed_out_shape;
-
-  switch (in_out_shape.rank()) {
-    case 4:
-      transposed_out_shape = transposeShape<3, 2, 1, 0>(in_out_shape);
-      break;
-    case 2:
-      transposed_out_shape = transposeShape<1, 0>(in_out_shape);
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions in softmax");
-  }
+  const Shape& in_out_shape = op.getOutputShape(0);
 
-  auto out = genTensor(op, transposed_out_shape);
-  auto prefix = out->name();
+  shared_ptr<ArtifactId> output = genTensor(op, in_out_shape);
+  auto layer_name_prefix = output->name();
 
   if (axis == 0) {
    // Simple version: no pre- or post-reshapes needed.
-    // Apply the softmax operaion.
-    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
-                       {AF::ref(in), AF::ref(out)});
-    allocate(out);
+    // Apply the softmax operation.
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
+                       {AF::ref(in), AF::ref(output)});
+    allocate(output);
     genLayerExecution(sm);
   } else {
+    // TODO: refactor this code; it currently works only with batch size 1
+
     // Need to reshape before the Softmax application and after it.
     // Then we need two tensors for intermediate results. This is because we do a couple of auxiliary
     // reshapes: one to transform the input tensor to a unidimensional tensor and the second to
     // transform the result of the softmax operation back to the original form.
-    auto tmp = genTensor(prefix + "_tmp", sm_shape);
-    auto tmp2 = genTensor(prefix + "_tmp2", sm_shape);
+    Shape sm_shape(in_out_shape);
+
+    std::swap(sm_shape.dim(axis), sm_shape.dim(-1));
+
+    auto tmp = genTensor(layer_name_prefix + "_tmp", sm_shape);
+    auto tmp2 = genTensor(layer_name_prefix + "_tmp2", sm_shape);
 
     // Do the input permutation.
-    auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1",
+    auto transp1 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer1",
                           {AF::ref(in), AF::ref(tmp)});
     allocate(tmp);
     genLayerExecution(transp1);
 
     // Apply the softmax operation.
-    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", layer_name_prefix + "_softmax_layer",
                        {AF::ref(tmp), AF::ref(tmp2)});
     allocate(tmp2);
     genLayerExecution(sm);
 
     // Reshape the output to the original form.
-    auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2",
-                          {AF::ref(tmp2), AF::ref(out)});
-    allocate(out);
+    auto transp2 = genLayer("arm_compute::CLReshapeLayer", layer_name_prefix + "_transp_layer2",
+                          {AF::ref(tmp2), AF::ref(output)});
+    allocate(output);
     genLayerExecution(transp2);
   }
 }
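
The non-trivial branch above is a reshape/softmax/reshape sandwich. A trace with hypothetical shapes, under the single-long-axis constraint checked at the top of the function:

// {1, 1, 1, 1000}: accepted -- exactly one axis > 1
// {1, 5, 1, 1000}: rejected -- throws "Unsupported Softmax operation ..."
//
// in   --CLReshapeLayer--> tmp    sm_shape: softmax axis swapped to the back
//                                 (a no-op while the assert pins axis to rank - 1)
// tmp  --CLSoftmaxLayer--> tmp2   softmax over the last axis
// tmp2 --CLReshapeLayer--> out    restore the original form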
@@ -277,7 +263,7 @@ AclCppOpGenerator::genTransposeMIRtoACL(const string& name,
     _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
     return output;
   }
-  Shape transposed_shape = transposeShape<2, 1, 3, 0>(input_shape);
+  Shape transposed_shape = transposeShape<0, 3, 1, 2>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
   genTranspose(input, transposed_id, {0, 3, 1, 2});
@@ -296,7 +282,7 @@ AclCppOpGenerator::genTransposeACLtoMIR(const string& name,
     _constrBlock->var("arm_compute::CLTensor&", output->name(), {}, {input});
     return output;
   }
-  Shape transposed_shape = transposeShape<1, 3, 2, 0>(input_shape);
+  Shape transposed_shape = transposeShape<0, 2, 3, 1>(input_shape);
   shared_ptr<ArtifactId> transposed_id =
       genTensor(name, transposed_shape, false);
   genTranspose(input, transposed_id, {0, 2, 3, 1});
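
The new template arguments are the plain NHWC <-> NCHW permutations; the old ones were these permutations pre-composed with the [3, 2, 1, 0] reversal that genTensor() used to require. For a hypothetical shape with dimensions N, H, W, C:

// MIR (NHWC) -> ACL (NCHW)
//   new: transposeShape<0, 3, 1, 2>({N, H, W, C}) == {N, C, H, W}
//   old: transposeShape<2, 1, 3, 0>({N, H, W, C}) == {W, H, C, N} -- the reverse
// ACL (NCHW) -> MIR (NHWC)
//   new: transposeShape<0, 2, 3, 1>({N, C, H, W}) == {N, H, W, C}
//   old: transposeShape<1, 3, 2, 0>({N, C, H, W}) == {C, W, H, N} -- the reverse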
@@ -355,7 +341,7 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
   // Generate auxiliary tensor to hold transposed output of pool in NCHW format
   Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(layer_name + "_out_transpose", transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(layer_name + "_out_transpose", transposed_output_shape);
 
   // Actual layer creation
   shared_ptr<ArtifactId> layer = genLayer("arm_compute::CLPoolingLayer", layer_name,
@@ -371,7 +357,7 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
 }
 
 void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
-  const TensorVariant& ir_weights = op.getWeights();
+  const TensorVariant ir_weights = transposeTensor<1, 0>(op.getWeights());
   const Shape& ir_weights_shape = ir_weights.getShape();
 
   auto& prev_nodes = op.getPrevNodes();
@@ -385,7 +371,7 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
   const Shape& out_shape = op.getOutputShape(0);
   if (out_shape.rank() != 2)
     throw AclCppException("Unsupported number of dimensions in fc layer");
-  auto out = genTensor(op, transposeShape<1, 0>(out_shape));
+  auto out = genTensor(op, out_shape);
   string operation_name = out->name() + "_fully_connected_layer";
 
   // Create the weights tensor in the DOM and use its id.
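
The new explicit weight transpose compensates for the serializer refactoring at the end of this diff: the removed serialization loop incremented coordinate 0 first, i.e. it emitted elements in reversed-axis order, while the new serializer dumps memory as-is. A sketch for the 2-D case, assuming MIR tensors are stored row-major:

// old: weights {in, out} serialized with axis 0 varying fastest
//      -> byte stream of the {out, in} matrix
// new: transposeTensor<1, 0> builds the {out, in} matrix explicitly,
//      then it is serialized contiguously -> the same byte stream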
@@ -438,14 +424,10 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
       transposed_input = genTransposeMIRtoACL(transposed_input_name, op.getInputShape(0), input);
 
       transposed_output =
-          genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+          genTensor(transposed_output_name, transposed_output_shape);
       break;
     }
     case 2:
-      transposed_output_shape = out_shape;
-      transposed_input = input;
-      transposed_output = genTensor(tensorName(&op), transposeShape<1, 0>(transposed_output_shape));
-      break;
     case 1:
       transposed_output_shape = out_shape;
       transposed_input = input;
@@ -462,18 +444,11 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   const auto ir_input_shape = op.getInputShape(0);
   Shape ir_biases_shape(ir_input_shape.rank());
 
-  // TODO remove this if after batch axis is restored in all operations in Model IR
-  if (op.getPrevNodes()[0].op->getType() == Operation::Type::fullyConnected) {
-    // Fully connected layer restores batch axis in result, so need to copy shape with redundant 1
-    // Shape transpose is needed to generate axises in reverse order
-    ir_biases_shape = transposeShape<1, 0>(op.getInputShape(0));
-  } else {
-    // ACL CLArithmeticAddition supports input tensors broadcasting.
-    for (int i = 0; i < ir_input_shape.rank(); ++i)
-      ir_biases_shape.dim(i) = 1;
+  // ACL CLArithmeticAddition supports input tensors broadcasting.
+  for (int i = 0; i < ir_input_shape.rank(); ++i)
+    ir_biases_shape.dim(i) = 1;
 
-    ir_biases_shape.dim(2) = ir_biases.getShape().dim(0);
-  }
+  ir_biases_shape.dim(1) = ir_biases.getShape().dim(0);
   auto biases = genTensor(layer_name + "_biases", ir_biases_shape);
 
   // Instantiate the CLArithmeticAddition object.
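
At this point the DOM input is already in NCHW order (via genTransposeMIRtoACL above), so the channel axis is dim 1 and the broadcastable bias shape for an assumed rank-4 input looks like:

// input  : {N, C, H, W}
// biases : {1, C, 1, 1}   // ir_biases_shape.dim(1) = ir_biases.getShape().dim(0)
// CLArithmeticAddition then broadcasts the bias across N, H and W.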
@@ -499,15 +474,12 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
 void AclCppOpGenerator::visit(ops::VariableOp& op) {
   shared_ptr<ArtifactId> tensor;
   if (cli::debugTranspose) {
-    if (op.getOutputShape(0).rank() == 2)
-      tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
-    else
-      tensor = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+    tensor = genTensor(op, op.getOutputShape(0));
   } else {
-    if (op.getOutputShape(0).rank() == 2)
-      tensor = genTensor(op, transposeShape<1, 0>(op.getOutputShape(0)));
+    if (op.getOutputShape(0).rank() == 4)
+      tensor = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
     else
-      tensor = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+      tensor = genTensor(op, op.getOutputShape(0));
   }
   allocate(tensor);
 }
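
In effect, with cli::debugTranspose set the variable keeps its MIR layout, while in the default mode 4-D variables are created directly in the ACL-friendly NCHW order:

// debugTranspose : {N, H, W, C} -> {N, H, W, C}  (unchanged)
// default, 4-D   : {N, H, W, C} -> {N, C, H, W}  (transposeShape<0, 3, 1, 2>)
// default, other : shape passed through unchanged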
@@ -516,22 +488,7 @@ void AclCppOpGenerator::visit(ops::ConstantOp& op) {
   Shape out_shape = op.getOutputShape(0);
   TensorVariant data = op.getValue();
 
-  Shape transposed_shape;
-  // FIXME This is temporary solution,
-  // need to move this shape transposes into genTensor function and
-  // implement transpose operation to support ranks greater than 2
-  switch (out_shape.rank()) {
-    case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
-      break;
-    case 1:
-      transposed_shape = out_shape;
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
-  }
-
-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+  shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
   allocate(out);
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
@@ -564,18 +521,7 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
     }
   }
 
-  Shape transposed_shape;
-  switch (out_shape.rank()) {
-    case 2:
-      transposed_shape = transposeShape<1, 0>(out_shape);
-      break;
-    case 1:
-      transposed_shape = out_shape;
-      break;
-    default:
-      throw AclCppException("Unsupported number of dimensions: " + to_string(out_shape.rank()));
-  }
-  shared_ptr<ArtifactId> out = genTensor(op, transposed_shape);
+  shared_ptr<ArtifactId> out = genTensor(op, out_shape);
 
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
   auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
@@ -609,8 +555,6 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
       transposed_output_shape = transposeShape<0, 3, 1, 2>(out_shape);
       break;
     case 2:
-      transposed_output_shape = transposeShape<1, 0>(out_shape);
-      break;
     case 1:
       transposed_output_shape = out_shape;
       break;
@@ -620,21 +564,21 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
 
   const string transposed_output_name = output_tensor_name + "_transposed_output";
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(transposed_output_name, transposed_output_shape);
 
   auto operation_name = transposed_output->name() + "_scale_layer";
 
   const auto& ir_scales = op.getWeights();
 
   // Reshape the IR scales tensor and generate the corresponding DOM tensor.
-  const Shape ir_input_shape = transposeShape<2, 1, 3, 0>(op.getInputShape(0));
+  const Shape ir_input_shape = transposeShape<0, 3, 1, 2>(op.getInputShape(0));
   Shape ir_scales_shape(ir_input_shape.rank());
 
   // ACL CLArithmeticDivision supports input tensors broadcasting.
   for (int i = 0; i < ir_input_shape.rank(); ++i)
     ir_scales_shape.dim(i) = 1;
 
-  ir_scales_shape.dim(2) = ir_scales.getShape().dim(0);
+  ir_scales_shape.dim(1) = ir_scales.getShape().dim(0);
   auto scales = genTensor(operation_name + "_scales", ir_scales_shape);
 
   // We do not use the genMultiplication() function here because the input needs broadcasting.
@@ -703,7 +647,7 @@ void AclCppOpGenerator::visit(ops::TanhOp& op) {
 
 void AclCppOpGenerator::visit(ops::ElementwiseOp& op) {
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  auto out = genTensor(op, op.getOutputShape(0));
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() >= 2);
@@ -751,10 +695,8 @@ void AclCppOpGenerator::visit(ops::PadOp& op) {
 
 template <typename Op>
 void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, const string& suffix) {
-  auto ir_weights = transposeTensor<1, 0, 2, 3>(op.getKernel());
+  auto ir_weights = transposeTensor<3, 2, 0, 1>(op.getKernel());
   const auto& ir_weights_shape = ir_weights.getShape();
-  assert(ir_weights_shape.rank() == 4);
-  Shape ir_biases_shape({ir_weights_shape.dim(-1)});
 
   auto& prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() == 1);
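
The same serializer argument as in the fully connected case explains the new kernel permutation: reversing the axes of the old <1, 0, 2, 3> result gives exactly <3, 2, 0, 1>, so an explicitly transposed kernel dumped contiguously matches what the old reversed-order serializer emitted. For a hypothetical kernel shape {d0, d1, d2, d3}:

// old: transposeTensor<1, 0, 2, 3> -> {d1, d0, d2, d3},
//      serialized with axis 0 fastest   -> stream in {d3, d2, d0, d1} order
// new: transposeTensor<3, 2, 0, 1> -> {d3, d2, d0, d1},
//      serialized contiguously          -> the same stream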
@@ -774,7 +716,7 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
   const string transposed_output_name = output_tensor_name + "_transposed_output";
   Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(op.getOutputShape(0));
   shared_ptr<ArtifactId> transposed_output =
-      genTensor(transposed_output_name, transposeShape<3, 2, 1, 0>(transposed_output_shape));
+      genTensor(transposed_output_name, transposed_output_shape);
 
   string operation_name = output_tensor_name + suffix;
 
@@ -828,9 +770,9 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   // Create the output tensor in the DOM and return its id.
   shared_ptr<ArtifactId> output;
   if (cli::debugTranspose)
-    output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+    output = genTensor(op, op.getOutputShape(0));
   else
-    output = genTensor(op, transposeShape<2, 1, 3, 0>(op.getOutputShape(0)));
+    output = genTensor(op, transposeShape<0, 3, 1, 2>(op.getOutputShape(0)));
 
   auto prefix = output->name() + "_activation_layer";
 
@@ -971,7 +913,7 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name,
     // create vector of initializers from Shape
     shape_vectorized.reserve(ir_shape.rank());
     for (int i = 0; i < ir_shape.rank(); ++i)
-      shape_vectorized.push_back(ir_shape.dim(i));
+      shape_vectorized.push_back(ir_shape.dim(-i - 1));
 
     const char* type_name = "arm_compute::TensorShape";
     shared_ptr<ArtifactId> shape =
@@ -1032,28 +974,8 @@ void AclCppOpGenerator::serializeIRTensor(const TensorVariant& tensor) {
     dimensions.at(i) = shape.dim(i);
   }
 
-  for (;;) {
-    float v;
-    memcpy(&v, tensor.at(coords), tensor.getElementSize());
-    _parOut.write(tensor.at(coords), tensor.getElementSize());
-    bool stop = true;
-    int i;
-
-    for (i = 0; i < shape.rank(); ++i) {
-      if(coords.at(i) < dimensions.at(i) - 1) {
-        ++coords.at(i);
-        stop = false;
-        break;
-      }
-    }
-
-    if (stop) {
-      break;
-    } else {
-      for (int j = 0; j < i; ++j)
-        coords.at(j) = 0;
-    }
-  }
+  size_t data_size = tensor.getElementSize() * tensor.getShape().numElements();
+  _parOut.write(tensor.atOffset(0), data_size);
 }
 
 void AclCppOpGenerator::genDeserializations() {
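
A minimal sketch of the new serialization path, with hypothetical simplified signatures (the real TensorVariant lives in MIR and is assumed to expose contiguous storage through atOffset()):

#include <cstddef>
#include <ostream>

// One write over the whole backing buffer replaces the removed coordinate
// "odometer" loop. This is valid only when the in-memory element order
// already matches the desired on-disk order; for the weight tensors in
// this diff that is arranged by the explicit transposeTensor calls at the
// call sites (see the FullyConnected and convolution hunks above).
void serializeContiguous(std::ostream& out, const char* base,
                         size_t element_size, size_t num_elements) {
  out.write(base, static_cast<std::streamsize>(element_size * num_elements));
}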
@@ -1156,7 +1078,7 @@ void AclCppOpGenerator::visit(mir::ops::TransposeOp& op) {
   if (op.getOutputShape(0).rank() != 4)
     throw AclCppException("Unsupported number of dimensions in transpose operation");
   // TODO replace transpose shape
-  shared_ptr<ArtifactId> output = genTensor(op, transposeShape<3, 2, 1, 0>(op.getOutputShape(0)));
+  shared_ptr<ArtifactId> output = genTensor(op, op.getOutputShape(0));
 
   // Actual generation of operation and related stuff
   genTranspose(input, output, mir_axis_order);