From 8a17ac1cdcf487a5b9a50a49718f4b412ce2d75f Mon Sep 17 00:00:00 2001
From: Ivan Vagin/AI Tools Lab /SRR/Engineer/Samsung Electronics
Date: Thu, 4 Jul 2019 12:59:38 +0300
Subject: [PATCH] [neurun] Applied shape fixer (#5549)

Applied ShapeFixer, removed ShapeFixer functionality from StageGenerator

Signed-off-by: Ivan Vagin
---
 runtimes/neurun/backend/acl_cl/StageGenerator.cc   | 123 ---------------------
 runtimes/neurun/backend/acl_neon/StageGenerator.cc |  33 ------
 runtimes/neurun/backend/cpu/StageGenerator.cc      |   7 --
 .../neurun/core/src/compiler/ExecutorFactory.cc    |  37 +++----
 runtimes/neurun/core/src/compiler/PlanBuilder.cc   |  19 ++--
 runtimes/neurun/core/src/compiler/PlanBuilder.h    |   3 +-
 6 files changed, 31 insertions(+), 191 deletions(-)

diff --git a/runtimes/neurun/backend/acl_cl/StageGenerator.cc b/runtimes/neurun/backend/acl_cl/StageGenerator.cc
index 7304c34..eb245a9 100644
--- a/runtimes/neurun/backend/acl_cl/StageGenerator.cc
+++ b/runtimes/neurun/backend/acl_cl/StageGenerator.cc
@@ -473,12 +473,8 @@ void StageGenerator::visit(const model::operation::ConcatNode &node)
   Param param;

   param.output_index = ofm_index;
-  _tensor_builder->dimCorrection(ofm_index, false);
   for (const auto &e : node.getInputs())
-  {
     param.input_indexes.emplace_back(e);
-    _tensor_builder->dimCorrection(e, false);
-  }
   param.axis = _ctx.at(axis_index).asScalar();

   auto tensors = _tensor_builder;
@@ -555,8 +551,6 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
   UNUSED_RELEASE(feature_size);
   assert(feature_size == batch_size * input_size);

-  tensors->dimCorrection(input_index, false);
-
   // for reshaping
   needs_reshape = true;
   reshape.dim(0) = batch_size; /* H */
@@ -618,17 +612,6 @@ void StageGenerator::visit(const model::operation::MulNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-
-    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
-    // a node to extend shape may be inserted in front of this operation
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
@@ -764,9 +747,6 @@ void StageGenerator::visit(const model::operation::ReshapeNode &node)

   auto tensors = _tensor_builder;

-  tensors->dimCorrection(input_index, false);
-  tensors->dimCorrection(output_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto output_alloc = tensors->at(param.output_index).get();
     auto input_alloc = tensors->at(param.input_index).get();
@@ -802,9 +782,6 @@ void StageGenerator::visit(const model::operation::SqueezeNode &node)
   Param param{output_index, input_index};
   auto tensors = _tensor_builder;

-  tensors->dimCorrection(input_index, false);
-  tensors->dimCorrection(output_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto output_alloc = tensors->at(param.output_index).get();
     auto input_alloc = tensors->at(param.input_index).get();
@@ -1081,14 +1058,6 @@ void StageGenerator::visit(const model::operation::AddNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
@@ -1136,17 +1105,6 @@ void StageGenerator::visit(const model::operation::SubNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-
-    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
-    // a node to extend shape may be inserted in front of this operation
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
@@ -1194,17 +1152,6 @@ void StageGenerator::visit(const model::operation::DivNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-
-    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
-    // a node to extend shape may be inserted in front of this operation
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   // Construct operation parameters
   struct Param
   {
@@ -1324,17 +1271,6 @@ void StageGenerator::visit(const model::operation::LogicalAndNode &node)
   const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)};
   const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)};

-  if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
-
-    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
-    // a node to extend shape may be inserted in front of this operation
-    const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
-  }
-
   // Construct operation parameters
   struct Param
   {
@@ -1728,17 +1664,6 @@ void StageGenerator::visit(const model::operation::ComparisonNode &node)
   const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)};
   const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)};

-  if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
-
-    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
-    // a node to extend shape may be inserted in front of this operation
-    const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
-  }
-
   // Construct operation parameters
   struct Param
   {
@@ -2094,9 +2019,6 @@ void StageGenerator::visit(const model::operation::SpaceToDepthNode &node)

   auto tensors = _tensor_builder;

-  tensors->dimCorrection(ofm_index, false);
-  tensors->dimCorrection(ifm_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto ofm_alloc = tensors->at(param.ofm_index).get();
     auto ifm_alloc = tensors->at(param.ifm_index).get();
@@ -2199,9 +2121,6 @@ void StageGenerator::visit(const model::operation::EmbeddingLookupNode &node)

   auto tensors = _tensor_builder;

-  tensors->dimCorrection(values_index, false);
-  tensors->dimCorrection(output_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto output_alloc = tensors->at(param.output_index).get();
     auto lookups_alloc = tensors->at(param.lookups_index).get();
@@ -2340,14 +2259,6 @@ void StageGenerator::visit(const model::operation::PReLUNode &node)
   const auto ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)};
   const auto alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)};

-  if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank());
-    const_cast<::neurun::model::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
@@ -2501,14 +2412,6 @@ void StageGenerator::visit(const model::operation::LogicalOrNode &node)
   const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)};
   const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)};

-  if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
-    const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
-  }
-
   // Construct operation parameters
   struct Param
   {
@@ -2587,14 +2490,6 @@ void StageGenerator::visit(const model::operation::SquaredDifferenceNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   // Construct operation parameters
   struct Param
   {
@@ -2719,10 +2614,6 @@ void StageGenerator::visit(const model::operation::GatherNode &node)

   auto tensors = _tensor_builder;

-  tensors->dimCorrection(ofm_index, false);
-  tensors->dimCorrection(ifm_index, false);
-  tensors->dimCorrection(indices_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto ofm_alloc = tensors->at(param.ofm_index).get();
     auto ifm_alloc = tensors->at(param.ifm_index).get();
@@ -2852,9 +2743,6 @@ void StageGenerator::visit(const model::operation::ArgMaxNode &node)
   assert(axis_shape.rank() == 1);
   assert((ifm_shape.rank() - 1) == ofm_shape.rank());

-  _tensor_builder->dimCorrection(ofm_index, false);
-  _tensor_builder->dimCorrection(ifm_index, false);
-
   std::vector l_axis;
   const int axis_size = axis_shape.num_elements();
   auto axis_base = _ctx.at(axis_index).data().base();
@@ -3302,12 +3190,8 @@ void StageGenerator::visit(const model::operation::SplitNode &node)
   param.axis = acl_common::ToARMComputeAxis(ifm_rank, param.axis).value();
   param.ifm_rank = ifm_rank;

-  _tensor_builder->dimCorrection(input_index, false);
   for (const auto &e : node.getOutputs())
-  {
     param.output_indexes.emplace_back(e);
-    _tensor_builder->dimCorrection(e, false);
-  }

   auto tensors = _tensor_builder;
@@ -3365,12 +3249,8 @@ void StageGenerator::visit(const model::operation::UnpackNode &node)
   param.axis += input_rank;
   param.axis = acl_common::ToARMComputeAxis(input_rank, param.axis).value();

-  _tensor_builder->dimCorrection(input_index, false);
   for (const auto &output_index : node.getOutputs())
-  {
     param.output_indexes.emplace_back(output_index);
-    _tensor_builder->dimCorrection(output_index, false);
-  }

   auto tensors = _tensor_builder;
@@ -3437,9 +3317,6 @@ void StageGenerator::visit(const model::operation::PadNode &node)

   auto tensors = _tensor_builder;

-  _tensor_builder->dimCorrection(input_index, false);
-  _tensor_builder->dimCorrection(output_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto input = tensors->at(param.input_index).get()->handle();
     auto output = tensors->at(param.output_index).get()->handle();
diff --git a/runtimes/neurun/backend/acl_neon/StageGenerator.cc b/runtimes/neurun/backend/acl_neon/StageGenerator.cc
index 181336c..0da3bdc 100644
--- a/runtimes/neurun/backend/acl_neon/StageGenerator.cc
+++ b/runtimes/neurun/backend/acl_neon/StageGenerator.cc
@@ -436,12 +436,8 @@ void StageGenerator::visit(const model::operation::ConcatNode &node)
   Param param;

   param.output_index = ofm_index;
-  _tensor_builder->dimCorrection(ofm_index, false);
   for (const auto &e : node.getInputs())
-  {
     param.input_indexes.emplace_back(e);
-    _tensor_builder->dimCorrection(e, false);
-  }
   param.axis = _ctx.at(axis_index).asScalar();

   auto tensors = _tensor_builder;
@@ -517,8 +513,6 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
   UNUSED_RELEASE(feature_size);
   assert(feature_size == batch_size * input_size);

-  tensors->dimCorrection(input_index, false);
-
   // for reshaping
   needs_reshape = true;
   reshape.dim(0) = batch_size; /* H */
@@ -580,17 +574,6 @@ void StageGenerator::visit(const model::operation::MulNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-    auto lhs_shape = _ctx.at(lhs_index).shape();
-    auto rhs_shape = _ctx.at(rhs_index).shape();
-
-    lhs_shape.extendRank(broadcast_rank);
-    rhs_shape.extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
@@ -602,11 +585,6 @@
   // TODO: fix, tests are failing
   throw std::runtime_error("NYI");
-  // Nontrivial broadcasting isn't supported yet
-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    throw std::runtime_error("NYI");
-  }

   Param param;
@@ -659,9 +637,6 @@ void StageGenerator::visit(const model::operation::ReshapeNode &node)

   auto tensors = _tensor_builder;

-  tensors->dimCorrection(input_index, false);
-  tensors->dimCorrection(output_index, false);
-
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto output_alloc = tensors->at(param.output_index).get();
     auto input_alloc = tensors->at(param.input_index).get();
@@ -751,14 +726,6 @@ void StageGenerator::visit(const model::operation::AddNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)};

-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
-  {
-    const auto broadcast_rank =
-        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
-    const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
-    const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
diff --git a/runtimes/neurun/backend/cpu/StageGenerator.cc b/runtimes/neurun/backend/cpu/StageGenerator.cc
index c89c669..6612638 100644
--- a/runtimes/neurun/backend/cpu/StageGenerator.cc
+++ b/runtimes/neurun/backend/cpu/StageGenerator.cc
@@ -506,13 +506,6 @@ void StageGenerator::visit(const model::operation::AddNode &node)
   const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)};
   const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)};

-  // Broadcasting and quantization
-  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()) ||
-      _ctx.at(lhs_index).typeInfo().type() == model::DataType::QUANT8_ASYMM)
-  {
-    throw std::runtime_error{"NYI"};
-  }
-
   struct Param
   {
     model::OperandIndex ofm_index;
diff --git a/runtimes/neurun/core/src/compiler/ExecutorFactory.cc b/runtimes/neurun/core/src/compiler/ExecutorFactory.cc
index d0b1623..2b30ca8 100644
--- a/runtimes/neurun/core/src/compiler/ExecutorFactory.cc
+++ b/runtimes/neurun/core/src/compiler/ExecutorFactory.cc
@@ -28,6 +28,7 @@
 #include "OperationValidator.h"
 #include "SubTensorAnalyzer.h"
 #include "PlanBuilder.h"
+#include "backend/IShapeFixer.h"
 #include "ConstantInitializer.h"

 #include "cpp14/memory.h"
@@ -93,19 +94,17 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor(graph::Graph &graph)

   PlanBuilder plan_builder{*operand_context, *operation_sequence};

-  // Plan building
+  // Fix shapes
   linear->iterate([&](const linear::Element &element) {
     auto backend = element.lower_info->backend();
-
-    // Generate Stage
-    auto stage_gen = backend->stage_gen();
-    plan_builder.addStage(stage_gen->generate(*element.subgraph));
+    auto shape_fixer = backend->shape_fixer();
+    shape_fixer->fix(*element.subgraph);
   });

   auto tensor_builders = linear->planTensors();

   // TODO Add optimization passes
-  plan_builder.finalize(tensor_builders);
+  plan_builder.finalize(linear.get(), tensor_builders);

   ConstantInitializer{graph, *operand_context, *linear->getLowerInfo()}();
@@ -136,13 +135,12 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(graph::Graph &graph, bo
         subg.accept(subtensor_analyzer);
       });

+  // Fix shapes
   graph.subg_ctx().iterate(
       [&](const model::SubgraphIndex &subg_index, const model::Subgraph &subg) {
         auto backend = graph.getLowerInfo(subg_index)->backend();
-
-        // Generate Stage
-        auto stage_gen = backend->stage_gen();
-        stages[subg_index] = stage_gen->generate(subg);
+        auto shape_fixer = backend->shape_fixer();
+        shape_fixer->fix(subg);
       });

   backend::TensorBuilderSet tensor_builders;
@@ -193,9 +191,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(graph::Graph &graph, bo
     });
   }

-  // TODO Extract this to another class
-  // IExecutionBuilder should be moved to `compiler/IExecutionBuilder.h` from
-  // `backend/IStageGenerator.h`.
+  // TODO Extract this to another file
   class ExecutionBuilder : public IExecutionBuilder
   {
   public:
@@ -221,12 +217,15 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(graph::Graph &graph, bo

   ExecutionBuilder execution_builder;

-  for (auto &&itr : stages)
-  {
-    // TODO This approach is temporal. See declaration of `setNextIndex`.
-    execution_builder.setNextIndex(itr.first);
-    (*itr.second)(execution_builder);
-  }
+  // Generate and process stages
+  graph.subg_ctx().iterate(
+      [&](const model::SubgraphIndex &subg_index, const model::Subgraph &subg) {
+        auto backend = graph.getLowerInfo(subg_index)->backend();
+        auto stage_gen = backend->stage_gen();
+        // TODO This approach is temporal. See declaration of `setNextIndex`.
+        execution_builder.setNextIndex(subg_index);
+        (*stage_gen->generate(subg))(execution_builder);
+      });

   for (const auto &tensor_builder : tensor_builders)
   {
diff --git a/runtimes/neurun/core/src/compiler/PlanBuilder.cc b/runtimes/neurun/core/src/compiler/PlanBuilder.cc
index dbe8b27..691b41a 100644
--- a/runtimes/neurun/core/src/compiler/PlanBuilder.cc
+++ b/runtimes/neurun/core/src/compiler/PlanBuilder.cc
@@ -17,6 +17,8 @@
 #include "PlanBuilder.h"

 #include "backend/operand/IObject.h"
+#include "linear/Linear.h"
+#include "backend/Backend.h"

 namespace neurun
 {
@@ -28,7 +30,8 @@ void PlanBuilder::addStage(std::unique_ptr stage)
 {
   _stages.emplace_back(std::move(stage));
 }

-void PlanBuilder::finalize(const backend::TensorBuilderSet &tensor_builders)
+void PlanBuilder::finalize(const linear::Linear *linear,
+                           const backend::TensorBuilderSet &tensor_builders)
 {
@@ -42,13 +45,13 @@ void PlanBuilder::finalize(const backend::TensorBuilderSet &tensor_builders)
     });
   }

-  // Process Stage
+  // Generate and process stages
   ExecutionBuilder execution_builder{_operations};
-
-  for (const auto &stage : _stages)
-  {
-    (*stage)(execution_builder);
-  }
+  linear->iterate([&](const linear::Element &element) {
+    auto backend = element.lower_info->backend();
+    auto stage_gen = backend->stage_gen();
+    (*stage_gen->generate(*element.subgraph))(execution_builder);
+  });

   // Allocate Tensor Memory for cl_tensors
   for (auto &tensor_builder : tensor_builders)
   {
@@ -57,5 +60,5 @@
   }
 }

-} // namepsace compiler
+} // namespace compiler
 } // namespace neurun
diff --git a/runtimes/neurun/core/src/compiler/PlanBuilder.h b/runtimes/neurun/core/src/compiler/PlanBuilder.h
index 58fde49..7d62b89 100644
--- a/runtimes/neurun/core/src/compiler/PlanBuilder.h
+++ b/runtimes/neurun/core/src/compiler/PlanBuilder.h
@@ -23,6 +23,7 @@
 #include "backend/IStageGenerator.h"
 #include "backend/ITensorBuilder.h"
 #include "backend/IStage.h"
+#include "linear/Linear.h"

 namespace neurun
 {
@@ -61,7 +62,7 @@ public:
 public:
   // TODO Remove the argument `tensor_builders`
-  void finalize(const backend::TensorBuilderSet &tensor_builders);
+  void finalize(const linear::Linear *linear, const backend::TensorBuilderSet &tensor_builders);

 private:
   OperandContext &_operands;
-- 
2.7.4
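
Note: the rank-extension blocks deleted above now run as a dedicated pass behind each
backend's shape_fixer() entry point, before any stage is generated. The snippet below is
a minimal self-contained sketch of that idea only -- the Shape type and the
BroadcastShapeFixer class are simplified stand-ins for illustration, not the actual
neurun model::Shape / backend::IShapeFixer API:

    // shape_fixer_sketch.cc -- illustrative only; types are simplified stand-ins.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Stand-in for neurun's model::Shape (assumed interface).
    struct Shape
    {
      std::vector<int> dims;

      int rank() const { return static_cast<int>(dims.size()); }

      // Prepend size-1 dimensions until the shape reaches `to_rank`;
      // this is how broadcast operands are rank-aligned.
      void extendRank(int to_rank)
      {
        assert(to_rank >= rank());
        dims.insert(dims.begin(), to_rank - rank(), 1);
      }
    };

    // A pass in the spirit of backend::IShapeFixer: mutate operand shapes
    // before stage generation, so the stage generator itself can keep a
    // const view of the operand context (no const_cast needed).
    struct BroadcastShapeFixer
    {
      // Mirrors the blocks removed from visit(AddNode)/visit(MulNode)/...
      void fix(Shape &lhs, Shape &rhs) const
      {
        if (lhs.dims == rhs.dims)
          return; // equal shapes need no broadcasting
        const int broadcast_rank = std::max(lhs.rank(), rhs.rank());
        lhs.extendRank(broadcast_rank);
        rhs.extendRank(broadcast_rank);
      }
    };

    int main()
    {
      Shape lhs{{2, 3, 4}};
      Shape rhs{{4}}; // lower-rank operand to be broadcast

      BroadcastShapeFixer{}.fix(lhs, rhs);

      assert(lhs.rank() == 3 && rhs.rank() == 3);
      assert(rhs.dims == (std::vector<int>{1, 1, 4}));
      return 0;
    }

Running the fix as its own pass gives createLinearExecutor and createDataflowExecutor a
single shared mutation point ahead of planTensors() and stage generation, which is why
the per-visit() copies could be dropped from all three StageGenerators.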