[nnc] Make Mobilenet and Inception-v3 work (#2416)
author Тимур Отеллович Аблязимов / AI Tools Lab / SRR / Staff Engineer / 삼성전자 <t.ablyazimov@samsung.com>
Thu, 29 Nov 2018 10:21:08 +0000 (13:21 +0300)
committer Роман Михайлович Русяев / AI Tools Lab / SRR / Staff Engineer / 삼성전자 <r.rusyaev@samsung.com>
Thu, 29 Nov 2018 10:21:08 +0000 (13:21 +0300)
This commit starts a sequence of commits introducing the debugged versions of the ACL backend operations used in the Mobilenet and Inception-v3 networks.

Signed-off-by: Timur Ablyazimov <t.ablyazimov@samsung.com>
contrib/nnc/include/core/modelIR/TensorUtil.h
contrib/nnc/include/passes/acl_soft_backend/AclCppOpGenerator.h
contrib/nnc/include/passes/acl_soft_backend/ArtifactModel.h
contrib/nnc/passes/acl_soft_backend/AclArtifactUtilities.in
contrib/nnc/passes/acl_soft_backend/AclCppOpGenerator.cpp
contrib/nnc/passes/acl_soft_backend/ArtifactGeneratorCppCode.cpp
contrib/nnc/passes/acl_soft_backend/ArtifactGeneratorCppDecl.cpp
contrib/nnc/passes/acl_soft_backend/ArtifactModel.cpp

index dfbe81c..683652d 100644 (file)
@@ -30,9 +30,18 @@ namespace nnc
 namespace mir
 {
 
+// TODO: This is potentially unsafe. Consider how to improve the transposition concept.
 template<unsigned int... Ints>
 Shape transposeShape(const Shape& shape) {
-  Shape result{shape.dim(Ints)...};
+  std::vector<unsigned int> permutes{Ints...};
+  Shape result(shape);
+  int32_t nof_permutes = std::min<int32_t>(shape.rank(), permutes.size());
+
+  for (int32_t i = 0; i < nof_permutes; ++i) {
+    if (permutes[i] < nof_permutes)
+      result.dim(i) = shape.dim(permutes[i]);
+  }
+
   return result;
 }
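For reference, a minimal standalone sketch of the new transposeShape() semantics (it uses a simplified stand-in for mir::Shape, not the real class): only the first min(rank, number of template indices) dimensions are permuted, and out-of-range permutation indices are ignored, so extra indices applied to a lower-rank shape no longer read past its end.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for nnc::mir::Shape, for illustration only.
struct Shape {
  std::vector<int32_t> dims;
  int32_t rank() const { return static_cast<int32_t>(dims.size()); }
  int32_t& dim(int32_t i) { return dims[i]; }
  int32_t dim(int32_t i) const { return dims[i]; }
};

template <unsigned int... Ints>
Shape transposeShape(const Shape& shape) {
  std::vector<unsigned int> permutes{Ints...};
  Shape result(shape);
  int32_t nof_permutes = std::min<int32_t>(shape.rank(), permutes.size());
  for (int32_t i = 0; i < nof_permutes; ++i) {
    if (permutes[i] < static_cast<unsigned int>(nof_permutes))
      result.dim(i) = shape.dim(permutes[i]);
  }
  return result;
}

int main() {
  Shape hwc{{224, 320, 3}};
  assert(transposeShape<1, 0, 2>(hwc).dims == (std::vector<int32_t>{320, 224, 3}));

  Shape rank2{{10, 20}};  // rank lower than the number of indices: extra index is ignored
  assert(transposeShape<1, 0, 2>(rank2).dims == (std::vector<int32_t>{20, 10}));
  return 0;
}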
 
index c8c2e7e..ff0037c 100644 (file)
@@ -140,7 +140,7 @@ private:
   /**
    * @brief Generates a unique name for the tensor.
    */
-  std::string tensorName(mir::Operation* op) const;
+  std::string tensorName(const mir::Operation* op) const;
 
   /**
    * @brief Generates tensor shape in DOM.
@@ -177,13 +177,75 @@ private:
   void genNamed();
 
   /**
-   * @brief Serializes a tensor.
+   * @brief Schedule a tensor serialization.
+   * @param tensor_id - an artifact ID of the tensor.
+   * @param ir_tensor - the IR source of the tensor.
+   */
+  void serializeTensor(std::shared_ptr<ArtifactId> tensor_id, const mir::TensorVariant& ir_tensor);
+
+  /**
+   * @brief Serialize an IR tensor to a file.
    * @param tensor - tensor to serialize.
    */
-  void serializeTensor(const mir::TensorVariant& tensor);
+  void serializeIRTensor(const mir::TensorVariant& tensor);
+
+  /**
+   * @brief Generate the deserialization calls in the right places in the artifact.
+   */
+  void genDeserializations();
+
+  /**
+   * @brief Generate procedure calls for filling tensors with constant scalar values.
+   */
+  void genFillings();
+
+  /**
+   * @brief Store the tensor ID and its value for later generation (used for uniform tensors).
+   * @param tensor_id - ID of the tensor.
+   * @param val - its value.
+   */
+  void fillTensor(std::shared_ptr<ArtifactId> tensor_id, const std::string& val);
+
+  /**
+   * @brief Schedule the tensor allocation.
+   * @param tensor_id - ID of the scheduled tensor.
+   */
+  void allocate(std::shared_ptr<ArtifactId> tensor_id);
+
+  /**
+   * @brief Generate all the scheduled tensor allocations.
+   */
+  void genAllocates();
+
+  /**
+   * @brief Generate the layer declaration and the configure() call.
+   * @param layer_type - ACL layer type.
+   * @param layer_name - name of the layer variable in the artifact.
+   * @param config_params - input/output tensor names and other configuration information.
+   * @return - the generated layer ID.
+   */
+  std::shared_ptr<ArtifactId> genLayer(const std::string& layer_type, const std::string& layer_name,
+                const std::list<std::shared_ptr<ArtifactExpr>>& config_params);
+
+  /**
+   * @brief Generate the layer run() call.
+   * @param layer_id - layer ID.
+   */
+  void runLayer(std::shared_ptr<ArtifactId> layer_id);
 
+  /**
+   * @brief Input nodes.
+   */
   std::set<mir::Operation*> _inputs;
+
+  /**
+   * @brief Output nodes.
+   */
   std::set<mir::Operation*> _outputs;
+
+  /**
+   * @brief Names of all the named tensors.
+   */
   std::set<std::string> _tensorNames;
 
   /**
@@ -202,11 +264,6 @@ private:
   std::shared_ptr<ArtifactClass> _artifactClass;
 
   /**
-   * @brief The artifact constructor.
-   */
-  std::shared_ptr<ArtifactClassFunction> _constructor;
-
-  /**
    * @brief The artifact inference function.
    */
   std::shared_ptr<ArtifactClassFunction> _inferenceFunction;
@@ -235,6 +292,21 @@ private:
    * @brief The CLScheduler class representation in the DOM.
    */
   std::shared_ptr<ArtifactId> _clScheduler;
+
+  /**
+   * @brief Tensors which need to be allocated in the artifact.
+   */
+  std::list<std::shared_ptr<ArtifactId>> _allocates;
+
+  /**
+   * @brief Tensors serialized from the Model IR which need to be deserialized in the artifact.
+   */
+  std::list<std::shared_ptr<ArtifactId>> _serializations;
+
+  /**
+   * @brief Tensors which must be filled with constant values, paired with those values.
+   */
+  std::list<std::pair<std::shared_ptr<ArtifactId>, std::string>> _fillings;
 };
 
 } // namespace nnc
index a7aee1a..eae9d84 100644 (file)
@@ -195,6 +195,8 @@ private:
 enum class ArtifactUnOp {
   preIncr,
   preDecr,
+  heapNew,
+  heapFree,
   postIncr,
   postDecr
 };
@@ -440,6 +442,20 @@ public:
   std::shared_ptr<ArtifactBinaryExpr> bin(ArtifactBinOp op, std::shared_ptr<ArtifactExpr> left,
                                           std::shared_ptr<ArtifactExpr> right);
 
+  /**
+   * @brief Creates a heap new operation expression.
+   * @param expr - the operand of the generated new expression.
+   * @return - the created unary expression.
+   */
+  std::shared_ptr<ArtifactUnaryExpr> heapNew(std::shared_ptr<ArtifactExpr> expr);
+
+  /**
+   * @brief Creates a heap free operation expression.
+   * @param expr - the operand of the generated delete expression.
+   * @return - the created unary expression.
+   */
+  std::shared_ptr<ArtifactUnaryExpr> heapFree(std::shared_ptr<ArtifactExpr> expr);
+
 private:
   std::list<std::shared_ptr<ArtifactEntity>> _statements;
 };
@@ -599,7 +615,7 @@ public:
                                 const std::list<std::shared_ptr<ArtifactExpr>>& initializers = {}) {
     if (is_public) {
       auto var = std::make_shared<ArtifactClassVariable>(this, type_name, var_name, dimensions,
-        initializers);
+                                                         initializers);
       _publicVariables.push_back(var);
       return var;
     } else {
@@ -649,11 +665,20 @@ public:
     return _privateFunctions;
   }
 
+  ArtifactBlock* getConstrBlock() {
+    return &_constrBlock;
+  }
+
+  const ArtifactBlock* getConstrBlock() const {
+    return &_constrBlock;
+  }
+
 private:
     std::list<std::shared_ptr<ArtifactClassVariable>> _publicVariables;
     std::list<std::shared_ptr<ArtifactClassVariable>> _privateVariables;
     std::list<std::shared_ptr<ArtifactClassFunction>> _publicFunctions;
     std::list<std::shared_ptr<ArtifactClassFunction>> _privateFunctions;
+    ArtifactBlock _constrBlock;
 };
 
 /**
@@ -791,6 +816,24 @@ public:
                                             std::shared_ptr<ArtifactExpr> ind) {
     return std::make_shared<ArtifactIndex>(expr, ind);
   }
+
+  /**
+   * @brief Creates a heap new operation expression.
+   * @param expr - the operand of the generated new expression.
+   * @return - the created unary expression.
+   */
+  static std::shared_ptr<ArtifactUnaryExpr> heapNew(std::shared_ptr<ArtifactExpr> expr) {
+    return std::make_shared<ArtifactUnaryExpr>(ArtifactUnOp::heapNew, expr);
+  }
+
+  /**
+   * @brief Creates a heap free operation expression.
+   * @param expr - the operand of the generated delete expression.
+   * @return - the created unary expression.
+   */
+  static std::shared_ptr<ArtifactUnaryExpr> heapFree(std::shared_ptr<ArtifactExpr> expr) {
+    return std::make_shared<ArtifactUnaryExpr>(ArtifactUnOp::heapFree, expr);
+  }
 };
 
 } // namespace nnc
index e68bd48..b6ce150 100644 (file)
@@ -1,8 +1,6 @@
 static void initializeTensor(arm_compute::CLTensor& tensor, const arm_compute::TensorShape& ts) {
-  arm_compute::TensorInfo ti;
-  ti.init_auto_padding(ts, arm_compute::Format::F32);
+  arm_compute::TensorInfo ti(ts, arm_compute::Format::F32);
   tensor.allocator()->init(ti);
-  tensor.allocator()->allocate();
 }
 
 static void fillTensor(arm_compute::CLTensor& tensor, float scalar) {
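Dropping allocate() from initializeTensor() is what enables the deferred-allocation scheme used by the reworked generator: layers are configured on tensors that are initialized but not yet allocated, and the allocations, deserializations and constant fillings are generated afterwards (genAllocates(), genDeserializations(), genFillings() in AclCppOpGenerator.cpp below). A hypothetical sketch of the constructor shape the generated artifact is now expected to have (class, tensor and file names are invented, and only a single CLReshapeLayer is shown):

#include <fstream>
#include <string>

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

static void initializeTensor(arm_compute::CLTensor& tensor,
                             const arm_compute::TensorShape& ts) {
  arm_compute::TensorInfo ti(ts, arm_compute::Format::F32);
  tensor.allocator()->init(ti);  // describe the tensor, do not allocate yet
}

class NetArtifact {
public:
  NetArtifact() {
    // For the sketch only: initialize the OpenCL scheduler here.
    arm_compute::CLScheduler::get().default_init();

    _parIn.open("net.par", std::ios_base::in | std::ios_base::binary);
    if (_parIn.fail())
      throw std::string("Failed to open file: net.par for reading");

    // 1. Describe the tensors and configure the layers while the tensors are unallocated.
    initializeTensor(_in, arm_compute::TensorShape(3u, 224u, 224u));
    initializeTensor(_out, arm_compute::TensorShape(3u * 224u * 224u));
    _reshape_layer.configure(&_in, &_out);

    // 2. Deferred allocations (genAllocates()), followed in the real artifact by
    //    deserializeTensor() calls (genDeserializations()) and fillTensor() calls (genFillings()).
    _in.allocator()->allocate();
    _out.allocator()->allocate();
  }

  void Inference() {
    _reshape_layer.run();
    arm_compute::CLScheduler::get().sync();
  }

private:
  std::ifstream _parIn;
  arm_compute::CLTensor _in;
  arm_compute::CLTensor _out;
  arm_compute::CLReshapeLayer _reshape_layer;
};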
index d593b91..c8fe372 100644 (file)
@@ -32,18 +32,26 @@ using namespace std;
 using namespace mir;
 
 AclCppOpGenerator::AclCppOpGenerator(const string& name, ostream& par_out)
-    : _parOut(par_out), _module(name), _clScheduler(AF::id("arm_compute::CLScheduler")) {}
+  : _parOut(par_out), _module(name), _clScheduler(AF::id("arm_compute::CLScheduler")) {}
 
 const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
+  // Including headers.
   _module.addHeaderSysInclude("fstream");
   _module.addHeaderInclude("arm_compute/core/Types.h");
   _module.addHeaderInclude("arm_compute/runtime/CL/CLFunctions.h");
   _module.addHeaderInclude("arm_compute/runtime/CL/CLScheduler.h");
+  _module.addHeaderInclude("arm_compute/runtime/CL/CLBufferAllocator.h");
+  _module.addHeaderInclude("arm_compute/runtime/BlobLifetimeManager.h");
+  _module.addHeaderInclude("arm_compute/runtime/PoolManager.h");
+  _module.addHeaderInclude("arm_compute/runtime/MemoryManagerOnDemand.h");
+
+  // The general structure creation.
   _artifactClass = _module.createClass(_module.name());
-  _constructor = _artifactClass->func(true, "", _module.name());
-  _constrBlock = _constructor->getBlock();
+  _constrBlock = _artifactClass->getConstrBlock();
   _inferenceFunction = _artifactClass->func(true, "void", "Inference");
   _infBlock = _inferenceFunction->getBlock();
+
+  // Input parameter stream preparation.
   _parInVar = _artifactClass->var(false, "std::ifstream", "_parIn");
   _parIn = _parInVar->use();
   string par_file_name = cli::artifactName + ".par";
@@ -54,8 +62,14 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
   file_fail_block->addStatement(AF::lit("throw std::string(\"Failed to open file: " +
                                         par_file_name + " for reading\")"));
 
+  // Traverse the computational graph.
   g->accept(this);
+
+  // Generate all the deferred entities.
   genNamed();
+  genAllocates();
+  genDeserializations();
+  genFillings();
 
   // Make sure all the OpenCL jobs are done executing:
   _infBlock->call("sync", {}, AF::call("get", {}, _clScheduler, ArtifactCallType::scope));
@@ -64,13 +78,13 @@ const ArtifactModule& AclCppOpGenerator::generate(mir::Graph* g) {
 }
 
 void AclCppOpGenerator::visit(ops::ConcatOp& op) {
-  static const char* axis_names[] = {"arm_compute::DataLayoutDimension::CHANNEL",
-                                     "arm_compute::DataLayoutDimension::HEIGHT",
+  static const char* axis_names[] = {"arm_compute::DataLayoutDimension::HEIGHT",
                                      "arm_compute::DataLayoutDimension::WIDTH",
-                                     "arm_compute::DataLayoutDimension::BATCHES"};
+                                     "arm_compute::DataLayoutDimension::CHANNEL"};
 
-  assert(op.getAxis() < sizeof(axis_names) / sizeof(const char*));
-  auto out = genTensor(op, op.getOutputShape(0));
+  int axis = op.getAxis() < 0 ? op.getOutputShape(0).rank() + op.getAxis() : op.getAxis();
+  assert(axis < sizeof(axis_names) / sizeof(const char*));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
   auto prefix = out->name() + "_concatenate_layer";
   auto inputs_var = _constrBlock->var("std::vector<arm_compute::ICLTensor*>", prefix + "_inputs");
   auto inputs = inputs_var->use();
@@ -78,11 +92,11 @@ void AclCppOpGenerator::visit(ops::ConcatOp& op) {
   for (auto i : op.getPrevNodes())
     _constrBlock->call("push_back", {AF::ref(AF::id(tensorName(i.op)))}, inputs);
 
-  auto concat_layer_var = _artifactClass->var(false, "arm_compute::CLConcatenateLayer", prefix);
-  auto concat_layer = concat_layer_var->use();
-  _constrBlock->call("configure", {inputs, AF::ref(out), AF::lit(axis_names[op.getAxis()])},
-                     concat_layer);
-  _infBlock->call("run", {}, concat_layer);
+  auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
+                        {inputs, AF::ref(out), AF::lit(axis_names[axis])});
+
+  allocate(out);
+  runLayer(layer);
 }
 
 void AclCppOpGenerator::visit(ops::Conv2DOp& op) {
@@ -90,8 +104,7 @@ void AclCppOpGenerator::visit(ops::Conv2DOp& op) {
 }
 
 void AclCppOpGenerator::visit(ops::DepthwiseConv2DOp& op) {
-  genConvolution(op, "arm_compute::CLDepthwiseConvolutionLayer",
-                 "_depthwise_convolution_layer");
+  genConvolution(op, "arm_compute::CLDepthwiseConvolutionLayer", "_depthwise_convolution_layer");
 }
 
 void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
@@ -99,12 +112,70 @@ void AclCppOpGenerator::visit(ops::SoftmaxOp& op) {
   assert(in_ops.size() == 1);
   auto in_op = in_ops[0].op;
   auto in = AF::id(tensorName(in_op));
-  auto out = genTensor(op, op.getOutputShape(0));
-  auto sm_layer_var = _artifactClass->var(false, "arm_compute::CLSoftmaxLayer",
-                                          out->name() + "_softmax_layer");
-  auto sm_layer = sm_layer_var->use();
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(out)}, sm_layer);
-  _infBlock->call("run", {}, sm_layer);
+
+  int rank = op.getOutputShape(0).rank();
+  // CLPermute does not support all kinds of permutations at the moment, and the rank can be
+  // more than 2 in our models, so we cannot use CLTranspose either.
+  // This means we can only support tensors with no more than one axis of size greater than 1.
+  int axis = op.getAxis() < 0 ? rank + op.getAxis() : op.getAxis();
+  assert(axis == rank - 1);
+  int nof_long_axes = 0;
+
+  for (int i = 0; i < rank; ++i) {
+    if (op.getOutputShape(0).dim(i) > 1)
+      ++nof_long_axes;
+  }
+
+  // TODO: Consider how to support Softmax on more general inputs.
+  if (nof_long_axes > 1)
+    throw AclCppException("Unsupported Softmax operation with several dimensions greater than 1");
+
+  // Create the output tensor.
+  Shape in_out_shape(op.getOutputShape(0));
+  Shape sm_shape(in_out_shape);
+
+  if (axis != 0) {
+    int sm_dim = sm_shape.dim(axis);
+    sm_shape.dim(axis) = sm_shape.dim(0);
+    sm_shape.dim(0) = sm_dim;
+  }
+
+  auto out = genTensor(op, in_out_shape);
+  auto prefix = out->name();
+
+  if (axis == 0) {
+    // Simple case: no pre- and post-reshapes are needed.
+    // Apply the softmax operation.
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
+                       {AF::ref(in), AF::ref(out)});
+    allocate(out);
+    runLayer(sm);
+  } else {
+    // We need to reshape the input before the Softmax application and reshape the result back
+    // afterwards, so two tensors are needed for the intermediate results: one auxiliary reshape
+    // transforms the input tensor into a one-dimensional tensor, and the second transforms the
+    // result of the softmax operation back to the original form.
+    auto tmp = genTensor(prefix + "_tmp", sm_shape);
+    auto tmp2 = genTensor(prefix + "_tmp2", sm_shape);
+
+    // Do the input permutation.
+    auto transp1 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer1",
+                          {AF::ref(in), AF::ref(tmp)});
+    allocate(tmp);
+    runLayer(transp1);
+
+    // Apply the softmax operation.
+    auto sm = genLayer("arm_compute::CLSoftmaxLayer", prefix + "_softmax_layer",
+                       {AF::ref(tmp), AF::ref(tmp2)});
+    allocate(tmp2);
+    runLayer(sm);
+
+    // Reshape the output to the original form.
+    auto transp2 = genLayer("arm_compute::CLReshapeLayer", prefix + "_transp_layer2",
+                          {AF::ref(tmp2), AF::ref(out)});
+    allocate(out);
+    runLayer(transp2);
+  }
 }
 
 void AclCppOpGenerator::visit(ops::PoolOp& op) {
@@ -126,29 +197,29 @@ void AclCppOpGenerator::visit(ops::PoolOp& op) {
 
   auto in_op = prev_nodes[0].op;
   auto in = AF::id(tensorName(in_op));
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
   auto prefix = out->name() + "_pooling_layer";
 
   auto pad_stride_info_var = _constrBlock->var("arm_compute::PadStrideInfo",
                                                prefix + "_pad_stride_info",
-                                               {}, {AF::lit(to_string(op.getStrides().dim(0))),
-                                                    AF::lit(to_string(op.getStrides().dim(1))),
-                                                    AF::lit(to_string(op.getPadding(0))),
-                                                    AF::lit(to_string(op.getPadding(1)))});
+                                               {}, {AF::lit(to_string(op.getStrides().dim(1))),
+                                                    AF::lit(to_string(op.getStrides().dim(0))),
+                                                    AF::lit(to_string(op.getPadding(1))),
+                                                    AF::lit(to_string(op.getPadding(0)))});
   auto pad_stride_info = pad_stride_info_var->use();
   auto kernel_window_var = _constrBlock->var("arm_compute::Size2D", prefix + "_kernel_window", {},
-                                             {AF::lit(to_string(op.getWindowShape().dim(0))),
-                                              AF::lit(to_string(op.getWindowShape().dim(1)))});
+                                             {AF::lit(to_string(op.getWindowShape().dim(1))),
+                                              AF::lit(to_string(op.getWindowShape().dim(0)))});
   auto kernel_window = kernel_window_var->use();
-  auto pooling_info_var = _constrBlock->var("arm_compute::PoolingLayerInfo",
-                                            prefix + "_pooling_info", {}, {AF::lit(pooling_type),
-                                            kernel_window, pad_stride_info});
+  auto pooling_info_var = _constrBlock->var(
+                "arm_compute::PoolingLayerInfo", prefix + "_pooling_info", {},
+                {AF::lit(pooling_type), kernel_window, pad_stride_info,
+                 AF::lit(op.getBorderType() == ops::PoolOp::BorderType::EMPTY ? "true" : "false")});
   auto pooling_info = pooling_info_var->use();
-
-  auto pooling_layer_var = _artifactClass->var(false, "arm_compute::CLPoolingLayer", prefix);
-  auto pooling_layer = pooling_layer_var->use();
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(out), pooling_info}, pooling_layer);
-  _infBlock->call("run", {}, pooling_layer);
+  auto layer = genLayer("arm_compute::CLPoolingLayer", prefix,
+                        {AF::ref(in), AF::ref(out), pooling_info});
+  allocate(out);
+  runLayer(layer);
 }
 
 void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
@@ -163,27 +234,20 @@ void AclCppOpGenerator::visit(ops::FullyConnectedOp& op) {
   auto in = AF::id(tensorName(in_op));
 
   // Create the output tensor in the DOM.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
   string operation_name = out->name() + "_fully_connected_layer";
 
   // Create the weights tensor in the DOM and use its id.
   auto weights = genTensor(operation_name + "_weights", ir_weights_shape);
 
-  // Serialize the weights tensor and generate the function to deserialize it in the artifact.
-  serializeTensor(ir_weights);
-  _constrBlock->call("deserializeTensor", {_parIn, weights});
-
   // Instantiate the CLFullyConnectedLayer object.
-  auto fully_layer_var = _artifactClass->var(false, "arm_compute::CLFullyConnectedLayer",
-                                             operation_name);
-  auto fully_layer = fully_layer_var->use();
-
-  // Call the: fully_layer.configure(&in, &weights, nullptr, &out);
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(weights), AF::lit("nullptr"), AF::ref(out)},
-                     fully_layer);
-
-  // Call the: fully_layer.run();
-  _infBlock->call("run", {}, fully_layer);
+  auto layer = genLayer("arm_compute::CLFullyConnectedLayer", operation_name,
+                        {AF::ref(in), AF::ref(weights), AF::lit("nullptr"), AF::ref(out)});
+  allocate(weights);
+  // Serialize the weights tensor and generate the function to deserialize it in the artifact.
+  serializeTensor(weights, ir_weights);
+  allocate(out);
+  runLayer(layer);
 }
 
 void AclCppOpGenerator::visit(ops::CappedReluOp& op) {
@@ -202,7 +266,7 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   auto in = AF::id(tensorName(in_op));
 
   // Create the output tensor in the DOM and obtain its identifier.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
 
   // Prefix used for the name of variables related to the operation implementation.
   string operation_name = out->name() + "_bias_add_layer";
@@ -219,25 +283,20 @@ void AclCppOpGenerator::visit(ops::BiasAddOp& op) {
   ir_biases_shape.dim(-1) = ir_biases.getShape().dim(0);
   auto biases = genTensor(operation_name + "_biases", ir_biases_shape);
 
-  // Save the IR biases tensor to later read this in the artifact.
-  serializeTensor(ir_biases);
-  _constrBlock->call("deserializeTensor", {_parIn, biases});
-
   // Instantiate the CLArithmeticAddition object.
-  auto arithmetic_add_layer_var = _artifactClass->var(false, "arm_compute::CLArithmeticAddition",
-                                                      operation_name);
-  auto arithmetic_add_layer = arithmetic_add_layer_var->use();
-
-  // Call the: arithmetic_add_layer.configure(&in, &biases, &out);
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(biases), AF::ref(out)},
-                     arithmetic_add_layer);
-
-  // Call the: arithmetic_add_layer.run();
-  _infBlock->call("run", {}, arithmetic_add_layer);
+  auto layer = genLayer("arm_compute::CLArithmeticAddition", operation_name,
+                        {AF::ref(in), AF::ref(biases), AF::ref(out),
+                         AF::lit("arm_compute::ConvertPolicy::WRAP")});
+  allocate(biases);
+  // Save the IR biases tensor to later read this in the artifact.
+  serializeTensor(biases, ir_biases);
+  allocate(out);
+  runLayer(layer);
 }
 
 void AclCppOpGenerator::visit(ops::VariableOp& op) {
-  genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
+  auto tensor = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
+  allocate(tensor);
 }
 
 void AclCppOpGenerator::visit(ops::ReluOp& op) {
@@ -253,18 +312,13 @@ void AclCppOpGenerator::visit(ops::ReshapeOp& op) {
   auto in = AF::id(tensorName(in_op));
 
   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
 
   // Create an instance of the CLReshapeLayer class as a member of the artifact class.
-  auto reshape_layer_var = _artifactClass->var(false, "arm_compute::CLReshapeLayer",
-                                               out->name() + "_reshape_layer");
-  auto reshape_layer = reshape_layer_var->use();
-
-  // Generate the call: reshape_layer.configure(&in, &out);
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(out)}, reshape_layer);
-
-  // Generate the call: reshape_layer.run();
-  _infBlock->call("run", {}, reshape_layer);
+  auto layer = genLayer("arm_compute::CLReshapeLayer", out->name() + "_reshape_layer",
+                        {AF::ref(in), AF::ref(out)});
+  allocate(out);
+  runLayer(layer);
 }
 
 void AclCppOpGenerator::visit(ops::ScaleOp& op) {
@@ -278,41 +332,51 @@ void AclCppOpGenerator::visit(ops::ScaleOp& op) {
   auto in = AF::id(tensorName(in_op));
 
   // Generate output tensor description in the DOM.
-  auto out = genTensor(op, op.getOutputShape(0));
-  auto prefix = out->name() + "_scale_layer";
-
-  // Create a CLPixelWiseMultiplication instance.
-  auto scale_layer_var = _artifactClass->var(false, "arm_compute::CLPixelWiseMultiplication",
-                                             prefix);
-  auto scale_layer = scale_layer_var->use();
-  auto scale_tensor = genTensor(prefix + "_scales", in_op->getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
+  auto operation_name = out->name() + "_scale_layer";
 
-  // Construct the vector containing scales.
-  auto scales_var = _constrBlock->var("std::vector<float>", prefix + "_scales");
-  auto scales = scales_var->use();
   const auto& ir_scales = op.getWeights();
 
-  Tensor<float> scale_access(ir_scales);
-
-  for (auto& idx: ShapeRange(ir_scales.getShape())) {
-    float v = scale_access.at(idx);
-    _constrBlock->call("push_back", {AF::lit(to_string(v))}, scales);
-  }
+  // Reshape the IR scales tensor and generate the corresponding DOM tensor.
+  Shape ir_scales_shape;
+  const auto ir_input_shape = transposeShape<1, 0, 2>(op.getInputShape(0));
+  ir_scales_shape.resize(ir_input_shape.rank());
 
-  int dim = op.getInputShape(0).rank() - 1;
+  // ACL CLArithmeticDivision supports broadcasting of its input tensors.
+  for (int i = 0; i < ir_input_shape.rank() - 1; ++i)
+    ir_scales_shape.dim(i) = 1;
 
-  // Call the: fillTensorScales(scale_tensor, dim, scales);
-  _constrBlock->call("fillTensorScales", {scale_tensor, AF::lit(to_string(dim)), scales});
+  ir_scales_shape.dim(-1) = ir_scales.getShape().dim(0);
+  auto scales = genTensor(operation_name + "_scales", ir_scales_shape);
 
-  // Call the: scale_layer.configure(&in, &scale_tensor, &out, 1, ConvertPolicy::WRAP,
-  //                                 RoundingPolicy::TO_NEAREST_EVEN);
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(scale_tensor), AF::ref(out), AF::lit("1"),
-                                   AF::lit("arm_compute::ConvertPolicy::WRAP"),
-                                   AF::lit("arm_compute::RoundingPolicy::TO_NEAREST_EVEN")},
-                     scale_layer);
+  // We do not use the genMultiplication() function here because the input needs broadcasting.
 
-  // Call the: scale_layer.run();
-  _infBlock->call("run", {}, scale_layer);
+  // Create a unit tensor in the DOM.
+  auto unit = genTensor(operation_name + "_unit", ir_input_shape);
+
+  // Create a tmp tensor in the DOM to store the result of 1 / scale.
+
+  auto tmp = genTensor(operation_name + "_tmp", ir_input_shape);
+
+  // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
+  auto layer1 = genLayer("arm_compute::CLArithmeticDivision",
+                         operation_name + "_arithmetic_div_layer_1",
+                         {AF::ref(unit), AF::ref(scales), AF::ref(tmp)});
+  runLayer(layer1);
+
+  // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
+  auto layer2 = genLayer("arm_compute::CLArithmeticDivision",
+                         operation_name + "_arithmetic_div_layer_2",
+                         {AF::ref(in), AF::ref(tmp), AF::ref(out)});
+  allocate(scales);
+  // Save the IR scales tensor to later read this in the artifact.
+  serializeTensor(scales, ir_scales);
+  allocate(unit);
+  // Fill the unit tensor with the 1 value.
+  fillTensor(unit, "1");
+  allocate(tmp);
+  allocate(out);
+  runLayer(layer2);
 }
 
 void AclCppOpGenerator::visit(ops::BatchNormOp& op) {
@@ -402,10 +466,10 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
   const Shape& ir_weights_shape = ir_weights->getShape();
   assert(ir_weights_shape.rank() == 4);
   Shape ir_biases_shape({ir_weights_shape.dim(-1)});
-  const Shape& strides = op.getStrides();
+  const Shape& strides = transposeShape<1, 0>(op.getStrides());
   assert(strides.rank() == 3 && strides.dim(2) == 1);
-  uint32_t pad_x = op.getPadding(0);
-  uint32_t pad_y = op.getPadding(1);
+  uint32_t pad_x = op.getPadding(1);
+  uint32_t pad_y = op.getPadding(0);
   assert(op.getPadding(2) == 0);
 
   auto& prev_nodes = op.getPrevNodes();
@@ -422,24 +486,16 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
   // Generate a tensor for weights (kernel) in the DOM.
   auto weights = genTensor(operation_name + "_weights", ir_weights_shape);
 
-  // Save the IR weights tensor to later read this in the artifact.
-  serializeTensor(*ir_weights);
-  _constrBlock->call("deserializeTensor", {_parIn, weights});
-
   // Create a local variable of type PadStrideInfo in the artifact constructor:
   // PadStrideInfo pad_stride_info(stride_x, stride_y, pad_x, pad_y);
   auto pad_stride_info_var = _constrBlock->var("arm_compute::PadStrideInfo",
                                                operation_name + "_pad_stride_info",
                                                {}, {AF::lit(to_string(strides.dim(0))),
-                                                    AF::lit(to_string(strides.dim(1))),
-                                                    AF::lit(to_string(pad_x)),
-                                                    AF::lit(to_string(pad_y))});
+                                               AF::lit(to_string(strides.dim(1))),
+                                               AF::lit(to_string(pad_x)),
+                                               AF::lit(to_string(pad_y))});
   auto pad_stride_info = pad_stride_info_var->use();
 
-  // Create the convolution (/depthwise convolution/deconvolution) layer class instance.
-  auto conv_layer_var = _artifactClass->var(false, acl_func_name, operation_name);
-  auto conv_layer = conv_layer_var->use();
-
   // The parameter for the conv_layer.config(&in, &weights, nullptr, &out, pad_stride_info)
   // function call.
   list<shared_ptr<ArtifactExpr>> config_params{AF::ref(in), AF::ref(weights), AF::lit("nullptr"),
@@ -451,14 +507,17 @@ void AclCppOpGenerator::genConvolution(Op& op, const string& acl_func_name, cons
     config_params.push_back(AF::lit("0"));
   }
 
-  // Call the: conv_layer(&in, &weights, nullptr, &out, pad_stride_info(, 0, 0 - for deconv));
-  _constrBlock->call("configure", config_params, conv_layer);
-
-  // Call the: conv_layer.run();
-  _infBlock->call("run", {}, conv_layer);
+  // Create the convolution (/depthwise convolution/deconvolution) layer class instance.
+  auto layer = genLayer(acl_func_name, operation_name, config_params);
+  allocate(weights);
+  // Save the IR weights tensor to later read this in the artifact.
+  serializeTensor(weights, *ir_weights);
+  allocate(out);
+  runLayer(layer);
 }
 
-void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& activation_name, float a, float b) {
+void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& activation_name,
+                                      float a, float b) {
   auto &prev_nodes = op.getPrevNodes();
   assert(prev_nodes.size() == 1);
 
@@ -467,7 +526,7 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   auto in = AF::id(tensorName(in_op));
 
   // Create the output tensor in the DOM and return its id.
-  auto out = genTensor(op, op.getOutputShape(0));
+  auto out = genTensor(op, transposeShape<1, 0, 2>(op.getOutputShape(0)));
   auto prefix = out->name() + "_activation_layer";
 
   // Create an instance of the ActivationLayerInfo class as a local variable in the artifact
@@ -480,15 +539,10 @@ void AclCppOpGenerator::genActivation(mir::Operation& op, const std::string& act
   auto activation_info = activation_info_var->use();
 
   // Create an instance of the CLActivationLayer class as a member of the artifact class.
-  auto activation_layer_var = _artifactClass->var(false, "arm_compute::CLActivationLayer",
-                                                    prefix);
-  auto activation_layer = activation_layer_var->use();
-
-  // Generate the call: activation_layer.configure(&in, &out, activation_info);
-  _constrBlock->call("configure", {AF::ref(in), AF::ref(out), activation_info}, activation_layer);
-
-  // Generate the call: activation_layer.run();
-  _infBlock->call("run", {}, activation_layer);
+  auto layer = genLayer("arm_compute::CLActivationLayer", prefix,
+                        {AF::ref(in), AF::ref(out), activation_info});
+  allocate(out);
+  runLayer(layer);
 }
 
 shared_ptr<ArtifactId> AclCppOpGenerator::genAddition(const string& prefix, int index,
@@ -540,14 +594,14 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genMultiplication(const string& prefix
   auto unit = genTensor(operation_name + "_unit", ir_unit_shape);
 
   // Fill the unit tensor with the 1 value.
-  _constrBlock->call("fillTensor", {unit, AF::lit("1")});
+  fillTensor(unit, "1");
 
   // Create a tmp tensor in the DOM to store the result of 1 / in2.
   auto tmp = genTensor(operation_name + "_tmp", ir_shape);
 
-  // Create an instance of the CLActivationLayer class as a member of the artifact class.
+  // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
   auto arithmetic_div_layer_var1 = _artifactClass->var(false, "arm_compute::CLArithmeticDivision",
-                                                      operation_name + "_arithmetic_add_layer_1");
+                                                      operation_name + "_arithmetic_div_layer_1");
   auto arithmetic_div_layer1 = arithmetic_div_layer_var1->use();
 
   // Generate the call: arithmetic_div_layer1.configure(&unit, &in2, &tmp);
@@ -557,14 +611,14 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genMultiplication(const string& prefix
   // Generate the call: arithmetic_div_layer1.run();
   _infBlock->call("run", {}, arithmetic_div_layer1);
 
-  // Create an instance of the CLActivationLayer class as a member of the artifact class.
+  // Create an instance of the CLArithmeticDivision class as a member of the artifact class.
   auto arithmetic_div_layer_var2 = _artifactClass->var(false, "arm_compute::CLArithmeticDivision",
-                                                       operation_name + "_arithmetic_add_layer_2");
+                                                       operation_name + "_arithmetic_div_layer_2");
   auto arithmetic_div_layer2 = arithmetic_div_layer_var2->use();
 
   // Generate the call: arithmetic_div_layer2.configure(&in1, &tmp, &out);
   _constrBlock->call("configure", {AF::ref(in1), AF::ref(tmp), AF::ref(out)},
-                     arithmetic_div_layer1);
+                     arithmetic_div_layer2);
 
   // Generate the call: arithmetic_div_layer2.run();
   _infBlock->call("run", {}, arithmetic_div_layer2);
@@ -572,7 +626,7 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genMultiplication(const string& prefix
   return out;
 }
 
-string AclCppOpGenerator::tensorName(Operation* op) const {
+string AclCppOpGenerator::tensorName(const Operation* op) const {
   string tensor_name;
 
   if (!op->getName().empty()) {
@@ -618,7 +672,7 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string& name, const Sh
   return id;
 }
 
-std::shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(mir::Operation& op, const Shape& ir_shape) {
+shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(Operation& op, const Shape& ir_shape) {
   if (op.getPrevNodes().empty())
     _inputs.insert(&op);
 
@@ -644,7 +698,13 @@ void AclCppOpGenerator::genNamed() {
   }
 }
 
-void AclCppOpGenerator::serializeTensor(const TensorVariant& tensor) {
+void AclCppOpGenerator::serializeTensor(shared_ptr<ArtifactId> tensor_id,
+                                        const TensorVariant& ir_tensor) {
+  serializeIRTensor(ir_tensor);
+  _serializations.push_back(tensor_id);
+}
+
+void AclCppOpGenerator::serializeIRTensor(const TensorVariant& tensor) {
   const Shape& shape = tensor.getShape();
   Index coords;
   coords.resize(shape.rank());
@@ -657,6 +717,8 @@ void AclCppOpGenerator::serializeTensor(const TensorVariant& tensor) {
   }
 
   for (;;) {
+    float v;
+    memcpy(&v, tensor.at(coords), tensor.getElementSize());
     _parOut.write(tensor.at(coords), tensor.getElementSize());
     bool stop = true;
     int i;
@@ -678,10 +740,47 @@ void AclCppOpGenerator::serializeTensor(const TensorVariant& tensor) {
   }
 }
 
+void AclCppOpGenerator::genDeserializations() {
+  for (auto s : _serializations)
+    _constrBlock->call("deserializeTensor", {_parIn, s});
+}
+
+void AclCppOpGenerator::genFillings() {
+  for (auto f : _fillings)
+    _constrBlock->call("fillTensor", {f.first, AF::lit(f.second)});
+}
+
+void AclCppOpGenerator::fillTensor(shared_ptr<ArtifactId> tensor_id, const string& val) {
+  _fillings.push_back(make_pair(tensor_id, val));
+}
+
 void AclCppOpGenerator::visit(ops::SqueezeOp& op) {
   assert(false && "Unimplemented operation: Squeeze");
 }
 
+void AclCppOpGenerator::allocate(std::shared_ptr<ArtifactId> tensor_id) {
+  _allocates.push_back(tensor_id);
+}
+
+void AclCppOpGenerator::genAllocates() {
+  for (auto a : _allocates)
+    _constrBlock->call("allocate", {}, AF::call("allocator", {}, a), ArtifactCallType::ref);
+}
+
+shared_ptr<ArtifactId> AclCppOpGenerator::genLayer(
+                                              const string& layer_type,
+                                              const string& layer_name,
+                                              const list<shared_ptr<ArtifactExpr>>& config_params) {
+  auto layer_var = _artifactClass->var(false, layer_type, layer_name);
+  auto layer = layer_var->use();
+  _constrBlock->call("configure", config_params, layer);
+  return layer;
+}
+
+void AclCppOpGenerator::runLayer(shared_ptr<ArtifactId> layer_id) {
+  _infBlock->call("run", {}, layer_id);
+}
+
 void AclCppOpGenerator::visit(mir::ops::ResizeOp& op) {
   assert(false && "Unimplemented operation: Resize");
 }
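The Softmax handling above boils down to this: when the softmax axis is not dimension 0 of the generated tensor, and at most one dimension is greater than 1, the generator emits a reshape that moves the data into a layout where the softmax axis is dimension 0, applies CLSoftmaxLayer, and reshapes the result back. A hypothetical, compile-level sketch of the resulting layer sequence (shapes and names are invented, and initializeTensor() from AclArtifactUtilities.in is inlined as a lambda):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

void softmaxLastAxisExample() {
  // For the sketch only: the real artifact manages the CL scheduler itself.
  arm_compute::CLScheduler::get().default_init();

  // initializeTensor() helper, as generated from AclArtifactUtilities.in.
  auto init = [](arm_compute::CLTensor& t, const arm_compute::TensorShape& ts) {
    t.allocator()->init(arm_compute::TensorInfo(ts, arm_compute::Format::F32));
  };

  arm_compute::CLTensor in, out, tmp, tmp2;
  init(in, arm_compute::TensorShape(1u, 1u, 1000u));    // original layout
  init(out, arm_compute::TensorShape(1u, 1u, 1000u));
  init(tmp, arm_compute::TensorShape(1000u, 1u, 1u));   // softmax axis moved to dimension 0
  init(tmp2, arm_compute::TensorShape(1000u, 1u, 1u));

  arm_compute::CLReshapeLayer transp1, transp2;
  arm_compute::CLSoftmaxLayer sm;
  transp1.configure(&in, &tmp);    // pre-reshape
  sm.configure(&tmp, &tmp2);       // softmax over dimension 0
  transp2.configure(&tmp2, &out);  // post-reshape back to the original form

  // Deferred allocations, as produced by genAllocates().
  in.allocator()->allocate();
  out.allocator()->allocate();
  tmp.allocator()->allocate();
  tmp2.allocator()->allocate();

  transp1.run();
  sm.run();
  transp2.run();
  arm_compute::CLScheduler::get().sync();
}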
index 2b0c6f4..0a4d499 100644 (file)
@@ -95,7 +95,8 @@ void ArtifactGeneratorCppCode::visit(const ArtifactFunctionCall* node) {
 }
 
 void ArtifactGeneratorCppCode::visit(const ArtifactUnaryExpr* node) {
-  static const char* un_op_str[] = {"++", "--", "++", "--"};
+  // The trailing spaces in "new " and "delete " are intentional.
+  static const char* un_op_str[] = {"++", "--", "new ", "delete ", "++", "--"};
 
   if (node->getOp() < ArtifactUnOp::postIncr) {
     _out << un_op_str[static_cast<int>(node->getOp())];
@@ -186,6 +187,27 @@ void ArtifactGeneratorCppCode::visit(const ArtifactFunction* node) {
 }
 
 void ArtifactGeneratorCppCode::visit(const ArtifactClass* node) {
+  // Generate a public default constructor here.
+  _out << node->name() << "::" << node->name() << "()";
+
+  if (!node->privateVariables().empty()) {
+    _out << " : ";
+    bool add_delim = false;
+
+    for (auto v : node->privateVariables()) {
+      if (add_delim)
+        _out << ",\n";
+
+      v->accept(this);
+      add_delim = true;
+    }
+  }
+
+  node->getConstrBlock()->accept(this);
+  _out << endl;
+
+  // Then generate the rest of the class members.
+
   for (auto e : node->publicFunctions())
     e->accept(this);
 
@@ -194,6 +216,18 @@ void ArtifactGeneratorCppCode::visit(const ArtifactClass* node) {
 }
 
 void ArtifactGeneratorCppCode::visit(const ArtifactClassVariable* node) {
+  _out << node->name() << "(";
+  bool add_comma = false;
+
+  for (auto i : node->getInitializers()) {
+    if (add_comma)
+      _out << ", ";
+
+    i->accept(this);
+    add_comma = true;
+  }
+
+  _out << ")";
 }
 
 void ArtifactGeneratorCppCode::visit(const ArtifactClassFunction* node) {
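Together with the ArtifactGeneratorCppDecl change below, the net effect on the generated sources looks roughly like this (hypothetical class and member names, constructor-related output only): the declaration generator emits a public default constructor declaration, while this generator emits its definition with a member initializer list built from the private variables (each rendered by visit(ArtifactClassVariable) as name(initializers)), followed by the statements accumulated in getConstrBlock().

#include <fstream>

// net.h, as emitted by ArtifactGeneratorCppDecl (hypothetical artifact class "Net"):
class Net {
public:
  Net();                  // the newly generated public default constructor
  void Inference();
private:
  std::ifstream _parIn;   // a private variable of the artifact class
};

// net.cpp, as emitted by ArtifactGeneratorCppCode:
Net::Net() : _parIn() {   // initializer list built from the private variables
  // Statements added to getConstrBlock() are generated into this body, for example:
  _parIn.open("net.par", std::ios_base::in | std::ios_base::binary);
}

void Net::Inference() {
  // Inference statements are generated here as before.
}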
index b080f5c..a00fddd 100644 (file)
@@ -98,6 +98,11 @@ void ArtifactGeneratorCppDecl::visit(const ArtifactClass* node) {
   _out << "public:" << endl;
   ++_ind;
 
+  // Generate a public default constructor here.
+  _out << _ind << node->name() << "();" << endl;
+
+  // Then generate the rest of the class members.
+
   for (auto e : node->publicFunctions()) {
     _out << _ind;
     e->accept(this);
index 96291d3..3d037bc 100644 (file)
@@ -95,4 +95,16 @@ shared_ptr<ArtifactBinaryExpr> ArtifactBlock::bin(
   return bin;
 }
 
+shared_ptr<ArtifactUnaryExpr> ArtifactBlock::heapNew(shared_ptr<ArtifactExpr> expr) {
+  auto heap_new = make_shared<ArtifactUnaryExpr>(ArtifactUnOp::heapNew, expr);
+  _statements.push_back(heap_new);
+  return heap_new;
+}
+
+shared_ptr<ArtifactUnaryExpr> ArtifactBlock::heapFree(shared_ptr<ArtifactExpr> expr) {
+  auto heap_del = make_shared<ArtifactUnaryExpr>(ArtifactUnOp::heapFree, expr);
+  _statements.push_back(heap_del);
+  return heap_del;
+}
+
 } // namespace nnc