From ea5a0925d4573128324bc9c69af4b4a19510ed9b Mon Sep 17 00:00:00 2001
From: =?utf8?q?=D0=A2=D0=B8=D0=BC=D1=83=D1=80=20=D0=9E=D1=82=D0=B5=D0=BB?=
 =?utf8?q?=D0=BB=D0=BE=D0=B2=D0=B8=D1=87=20=D0=90=D0=B1=D0=BB=D1=8F=D0=B7?=
 =?utf8?q?=D0=B8=D0=BC=D0=BE=D0=B2/AI=20Tools=20Lab=20/SRR/Staff=20Enginee?=
 =?utf8?q?r/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
 <t.ablyazimov@samsung.com>
Date: Tue, 13 Nov 2018 19:27:02 +0300
Subject: [PATCH] [nnc] Problems fixed during the debug of the Convolution and
 the Softmax operations (#2142)

- Tensors and tensor shapes are now transposed in certain places, because the Model IR and the ACL library use different axis orders (HWC vs. WHC).
- Tensor serialization now writes elements with the lowest dimension varying fastest and the highest varying slowest, so that the data can be read directly by the standard ACL routine.

Signed-off-by: Timur Ablyazimov <t.ablyazimov@samsung.com>
---
 contrib/nnc/include/core/modelIR/TensorUtil.h      |  6 ++
 .../passes/acl_soft_backend/AclCppOpGenerator.h    |  7 +-
 .../acl_soft_backend/AclArtifactUtilities.in       |  2 +-
 .../passes/acl_soft_backend/AclCppOpGenerator.cpp  | 74 +++++++++++++++-------
 4 files changed, 59 insertions(+), 30 deletions(-)
diff --git a/contrib/nnc/include/core/modelIR/TensorUtil.h b/contrib/nnc/include/core/modelIR/TensorUtil.h
index b49890e..dfbe81c 100644
--- a/contrib/nnc/include/core/modelIR/TensorUtil.h
+++ b/contrib/nnc/include/core/modelIR/TensorUtil.h
@@ -31,6 +31,12 @@ namespace mir
 {
 
 template<unsigned int... Ints>
+Shape transposeShape(const Shape& shape) {
+  Shape result{shape.dim(Ints)...};
+  return result;
+}
+
+template<unsigned int... Ints>
 static std::shared_ptr <TensorVariant>
 transposeTensor(std::shared_ptr <const TensorVariant> tensor)
 {
diff --git a/contrib/nnc/include/passes/acl_soft_backend/AclCppOpGenerator.h b/contrib/nnc/include/passes/acl_soft_backend/AclCppOpGenerator.h
index 216f246..1b23d14 100644
--- a/contrib/nnc/include/passes/acl_soft_backend/AclCppOpGenerator.h
+++ b/contrib/nnc/include/passes/acl_soft_backend/AclCppOpGenerator.h
@@ -166,10 +166,10 @@ private:
   /**
    * @brief Generates a DOM tensor.
    * @param node - node for which this tensor generated.
-   * @param op - an IR operation for which this tensor is generated.
+   * @param ir_shape - a shape in IR.
    * @return - a DOM identifier for the created tensor.
    */
-  std::shared_ptr<ArtifactId> genTensor(mir::INode* node, mir::OpDescription& op);
+  std::shared_ptr<ArtifactId> genTensor(mir::INode* node, const mir::Shape& ir_shape);
 
   /**
    * @brief Generates accessors for the input/output tensors.
@@ -178,11 +178,8 @@ private:
 
   /**
    * @brief Serializes a tensor.
-   * @tparam Ints - transposes to use during serialization. Needed because ACL can use different
-   *                tensor layouts than the model IR.
    * @param tensor - tensor to serialize.
    */
-  template<unsigned int... Ints>
   void serializeTensor(const mir::TensorVariant& tensor);
 
   std::set<mir::INode*> _inputs;
diff --git a/contrib/nnc/passes/acl_soft_backend/AclArtifactUtilities.in b/contrib/nnc/passes/acl_soft_backend/AclArtifactUtilities.in
index c51a00d..e68bd48 100644
--- a/contrib/nnc/passes/acl_soft_backend/AclArtifactUtilities.in
+++ b/contrib/nnc/passes/acl_soft_backend/AclArtifactUtilities.in
@@ -24,7 +24,7 @@ static void deserializeTensor(std::istream& par_in, arm_compute::CLTensor& tenso
   window.use_tensor_dimensions(tensor.info()->tensor_shape());
   arm_compute::Iterator iter(&tensor, window);
   arm_compute::execute_window_loop(window, [&par_in, &iter](const arm_compute::Coordinates&) {
-      par_in.read(iter.ptr(), sizeof(float));
+      par_in.read(reinterpret_cast<char*>(iter.ptr()), sizeof(float));
     }, iter);
 
   tensor.unmap();
diff --git a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
index 2d5ca59..6f4774e 100644
--- a/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
+++ b/contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
@@ -70,7 +70,7 @@ void AclCppOpGenerator::visit(INode* node, ops::ConcatOp& op) {
                                      "arm_compute::DataLayoutDimension::BATCHES"};
 
   assert(op.getAxis() < sizeof(axis_names) / sizeof(const char*));
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -99,7 +99,7 @@ void AclCppOpGenerator::visit(INode* node, ops::SoftmaxOp& op) {
   assert(prev_nodes.size() == 1);
   auto in_node = prev_nodes[0].node;
   auto in = AF::id(tensorName(in_node));
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   auto sm_layer_var = _artifactClass->var(false, "arm_compute::CLSoftmaxLayer",
                                           out->name() + "_softmax_layer");
   auto sm_layer = sm_layer_var->use();
@@ -126,7 +126,7 @@ void AclCppOpGenerator::visit(INode* node, ops::PoolOp& op) {
 
   auto in_node = prev_nodes[0].node;
   auto in = AF::id(tensorName(in_node));
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   auto prefix = out->name() + "_pooling_layer";
 
   auto pad_stride_info_var = _constrBlock->var("arm_compute::PadStrideInfo",
@@ -163,14 +163,14 @@ void AclCppOpGenerator::visit(INode* node, ops::FullyConnectedOp& op) {
   auto in = AF::id(tensorName(in_node));
 
   // Create the output tensor in the DOM.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   string operation_name = out->name() + "_fully_connected_layer";
 
   // Create the weights tensor in the DOM and use its id.
   auto weights = genTensor(operation_name + "_weights", ir_weights_shape);
 
   // Serialize the weights tensor and generate the function to deserialize it in the artifact.
-  serializeTensor<3, 2, 1, 0>(ir_weights);
+  serializeTensor(ir_weights);
   _constrBlock->call("deserializeTensor", {_parIn, weights});
 
   // Instantiate the CLFullyConnectedLayer object.
@@ -202,7 +202,7 @@ void AclCppOpGenerator::visit(INode* node, ops::BiasAddOp& op) {
   auto in = AF::id(tensorName(in_node));
 
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
 
   // Prefix used for the name of variables related to the operation implementation.
   string operation_name = out->name() + "_bias_add_layer";
@@ -237,7 +237,9 @@ void AclCppOpGenerator::visit(INode* node, ops::BiasAddOp& op) {
 }
 
 void AclCppOpGenerator::visit(INode* node, ops::VariableOp& op) {
-  genTensor(node, op);
+  // Axes order is HWC in the Model IR and WHC in the ACL library, so we are switching the first
+  // two dimensions.
+  genTensor(node, transposeShape<1, 0, 2>(op.getOutputShape(0)));
 }
 
 void AclCppOpGenerator::visit(INode* node, ops::ReluOp& op) {
@@ -253,7 +255,7 @@ void AclCppOpGenerator::visit(INode* node, ops::ReshapeOp& op) {
   auto in = AF::id(tensorName(in_node));
 
   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
 
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
   auto reshape_layer_var = _artifactClass->var(false, "arm_compute::CLReshapeLayer",
@@ -278,7 +280,7 @@ void AclCppOpGenerator::visit(INode* node, ops::ScaleOp& op) {
   auto in = AF::id(tensorName(in_node));
 
   // Generate output tensor description in the DOM.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   auto prefix = out->name() + "_scale_layer";
 
   // Create a CLPixelWiseMultiplication instance.
@@ -330,7 +332,7 @@ void AclCppOpGenerator::visit(INode* node, ops::DropoutOp& op) {
   auto in = AF::id(tensorName(in_node));
 
   // Generate output tensor description in the DOM.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
 
   // Create a CLCopy instance.
   auto copy_layer_var = _artifactClass->var(false, "arm_compute::CLCopy",
@@ -350,7 +352,7 @@ void AclCppOpGenerator::visit(INode* node, ops::TanhOp& op) {
 
 void AclCppOpGenerator::visit(INode* node, ops::ElementwiseOp& op) {
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
 
   auto& prev_nodes = node->getPrevNodes();
   assert(prev_nodes.size() >= 2);
@@ -395,8 +397,8 @@ void AclCppOpGenerator::visit(INode* node, ops::EluOp& op) {
 template <typename Op>
 void AclCppOpGenerator::genConvolution(INode* node, Op& op, const string& acl_func_name,
                                        const string& suffix) {
-  const TensorVariant& ir_weights = op.getKernel();
-  const Shape& ir_weights_shape = ir_weights.getShape();
+  auto ir_weights = transposeTensor<1, 0, 2, 3>(make_shared<TensorVariant>(op.getKernel()));
+  const Shape& ir_weights_shape = ir_weights->getShape();
   assert(ir_weights_shape.rank() == 4);
   Shape ir_biases_shape({ir_weights_shape.dim(-1)});
   const Shape& strides = op.getStrides();
@@ -413,14 +415,14 @@ void AclCppOpGenerator::genConvolution(INode* node, Op& op, const string& acl_fu
   auto in = AF::id(tensorName(in_node));
 
   // Create the output tensor in the DOM.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, transposeShape<1, 0, 2>(op.getOutputShape(0)));
   string operation_name = out->name() + suffix;
 
   // Generate a tensor for weights (kernel) in the DOM.
   auto weights = genTensor(operation_name + "_weights", ir_weights_shape);
 
   // Save the IR weights tensor to later read this in the artifact.
-  serializeTensor<3, 2, 1, 0>(ir_weights);
+  serializeTensor(*ir_weights);
   _constrBlock->call("deserializeTensor", {_parIn, weights});
 
   // Create a local variable of type PadStrideInfo in the artifact constructor:
@@ -465,7 +467,7 @@ void AclCppOpGenerator::genActivation(INode* node, OpDescription& op,
   auto in = AF::id(tensorName(in_node));
 
   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(node, op);
+  auto out = genTensor(node, op.getOutputShape(0));
   auto prefix = out->name() + "_activation_layer";
 
   // Create an instance of the ActivationLayerInfo class as a local variable in the artifact
@@ -616,14 +618,14 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name, const Sh
   return id;
 }
 
-shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(INode* node, OpDescription& op) {
+shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(INode* node, const Shape& ir_shape) {
   if (node->getPrevNodes().empty())
     _inputs.insert(node);
 
   if (node->getNextNodes().empty())
     _outputs.insert(node);
 
-  return genTensor(tensorName(node), op.getOutputShape(0), !node->getName().empty());
+  return genTensor(tensorName(node), ir_shape, !node->getName().empty());
 }
 
 void AclCppOpGenerator::genNamed() {
@@ -642,14 +644,38 @@ void AclCppOpGenerator::genNamed() {
   }
 }
 
-template<unsigned int... Ints>
 void AclCppOpGenerator::serializeTensor(const TensorVariant& tensor) {
-  shared_ptr<TensorVariant> to_tranpose = make_shared<TensorVariant>(tensor);
-  shared_ptr<TensorVariant> transposed = transposeTensor<Ints...>(to_tranpose);
-  const Shape& shape = transposed->getShape();
+  const Shape& shape = tensor.getShape();
+  Index coords;
+  coords.resize(shape.rank());
+  Index dimensions;
+  dimensions.resize(shape.rank());
+
+  for (int i = 0; i < shape.rank(); ++i) {
+    coords.at(i) = 0;
+    dimensions.at(i) = shape.dim(i);
+  }
+
+  for (;;) {
+    _parOut.write(tensor.at(coords), tensor.getElementSize());
+    bool stop = true;
+    int i;
 
-  for (auto& idx: ShapeRange(shape))
-    _parOut.write(tensor.at(idx), sizeof(float));
+    for (i = 0; i < shape.rank(); ++i) {
+      if(coords.at(i) < dimensions.at(i) - 1) {
+        ++coords.at(i);
+        stop = false;
+        break;
+      }
+    }
+
+    if (stop) {
+      break;
+    } else {
+      for (int j = 0; j < i; ++j)
+        coords.at(j) = 0;
+    }
+  }
 }
 
 void AclCppOpGenerator::visit(INode* node, ops::SqueezeOp& op) {
-- 
2.7.4