From 80de8eeec580480e218713dbbc745377934b8ca9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ivan=20Vagin/AI=20Tools=20Lab=20/SRR/Engineer/=EC=82=BC?= =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Wed, 10 Jul 2019 08:22:15 +0300 Subject: [PATCH] [neurun] ACL CL kernel generation functionality moved into KernelGenerator (#5584) ACL CL kernel generation functionality moved from StageGenerator into KernelGenerator Signed-off-by: Ivan Vagin --- runtimes/neurun/backend/acl_cl/KernelGenerator.cc | 2174 +++++++++++++- runtimes/neurun/backend/acl_cl/StageGenerator.cc | 3273 +-------------------- 2 files changed, 2174 insertions(+), 3273 deletions(-) diff --git a/runtimes/neurun/backend/acl_cl/KernelGenerator.cc b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc index b125612..45ab283 100644 --- a/runtimes/neurun/backend/acl_cl/KernelGenerator.cc +++ b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc @@ -46,6 +46,102 @@ namespace acl_cl using ::neurun::backend::acl_common::asAclFunction; // +// ActivationBuilder +// +class ActivationBuilder +{ +public: + explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) + { + // DO NOTHING + } + +private: + void appendReLU(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU1(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU6(::arm_compute::ICLTensor *ifm_alloc); + +public: + void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc); + +private: + IExecutionBuilder &_builder; +}; + +void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc) +{ + switch (code) + { + case model::Activation::NONE: + { + // DO NOTHING + break; + } + case model::Activation::RELU: + { + appendReLU(ifm_alloc); + break; + } + case model::Activation::RELU1: + { + appendReLU1(ifm_alloc); + break; + } + case model::Activation::RELU6: + { + appendReLU6(ifm_alloc); + break; + } + default: + { + throw std::runtime_error("Not supported, yet"); + } + } +} + +// // KernelGenerator // KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx, @@ -55,115 +151,2083 @@ KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx, // DO NOTHING } -void KernelGenerator::visit(const model::operation::CastNode & /*node*/) {} +void 
KernelGenerator::visit(const model::operation::CastNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::CastNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::Conv2DNode &node) +{ + using model::operation::Conv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. + const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::DepthwiseConv2DNode &node) +{ + using model::operation::DepthwiseConv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
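+  // That is, dim(1) holds the kernel height and dim(2) the kernel width used for the padding
+  // calculation below.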
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + // TODO Use `activation` instead of `model::Activation::NONE`. See below. + const auto act_info = acl_common::asActivationLayerInfo(model::Activation::NONE); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + + // TODO Use fused activation instead of separate layer after switching to ACL version >= v19.05. + // Prior versions had a bug due to which the fused activation did not apply in some cases. + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::MaxPool2DNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::MaxPool2DNode::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, + ::arm_compute::Size2D{kw, kh}, + acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::AvgPool2DNode &node) +{ + const auto 
ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::AvgPool2DNode::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, + acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::ConcatNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto axis_index{node.param().axis_index}; + + std::vector input_indexes; + + for (const auto &input : node.getInputs()) + input_indexes.emplace_back(input); + + const auto axis = _ctx.at(axis_index).asScalar(); + + // If tensor allocator allocate as subtensor + bool canEliminate = true; + for (auto &ifm_ind : input_indexes) + { + if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind)) + { + canEliminate = false; + break; + } + } + if (canEliminate) + { + // If concat eliminated, return a NOP IFunction + _execution_builder->append(nnfw::cpp14::make_unique()); + return; + } + + auto output_alloc = _tensor_builder->at(ofm_index).get(); + + std::vector<::neurun::backend::acl_cl::operand::ICLTensor *> input_allocs; + for (auto &ifm_ind : input_indexes) + input_allocs.emplace_back(_tensor_builder->at(ifm_ind).get()); + + auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>(); + + fn->configure(input_allocs, axis, output_alloc); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::FullyConnectedNode &node) +{ + using model::operation::FullyConnectedNode; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnectedNode::Input::WEIGHT)}; + const auto 
bias_index{node.getInputs().at(FullyConnectedNode::Input::BIAS)}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + // TODO Currently we are not handling where the case is that the input's rank is 3. + // The handling should be added in the future. + assert(input_rank != 3); + + const auto output_size = _ctx.at(output_index).shape().dim(1); + UNUSED_RELEASE(output_size); + assert(_ctx.at(bias_index).shape().dim(0) == output_size); + assert(_ctx.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = _ctx.at(output_index).shape().dim(0); + const auto input_size = _ctx.at(weight_index).shape().dim(1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + neurun::model::Shape reshape(2); + if (input_rank == 4) + { + // TODO Support NCHW frontend + model::FeatureShape ifm_shape_feature = _ctx.at(input_index).shape().asFeature(); + auto feature_size = + ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W; + + UNUSED_RELEASE(feature_size); + assert(feature_size == batch_size * input_size); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weight_alloc = _tensor_builder->at(weight_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto acl_layout = output_alloc->handle()->info()->data_layout(); + + auto fn = nnfw::cpp14::make_unique(); + + fn->configure(input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), + output_alloc->handle(), needs_reshape, + ::neurun::backend::acl_common::asTensorShape( + reshape, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout))); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::MulNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::ReduceSumNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReduceSumNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + std::vector axes; + const auto axis_base = _ctx.at(axis_index).data().base(); + const auto axis_size = _ctx.at(axis_index).shape().num_elements(); + const auto 
input_rank = _ctx.at(input_index).shape().rank(); + + // The axis's data must exist as constant values + assert(axis_base != nullptr); + for (size_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast(axis_base) + n); + if (axis_value < 0) + { + axis_value += input_rank; + } + axes.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, axis_value).value()); + } + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + std::set axes_set; + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = input_alloc->handle()->info()->data_layout(); + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + for (size_t i = 0; i < axes.size(); ++i) + { + if (acl_layout == ::arm_compute::DataLayout::NCHW && input_rank == 4) + { + axes_set.insert(permutation[axes[i]]); + } + else + { + axes_set.insert(axes[i]); + } + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), axes_set, + ::arm_compute::ReduceOperation::SUM); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReshapeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // However, this runtime can be change the layout of this operation from NHWC to NCHW now + // TODO Change the layout of frontend and backend to be the same and layer to CLReshapeLayer + auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericReshapeLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SqueezeNode &node) +{ + // Squeeze is identical to reshape except that it has an optional dimensions input. 
+ // In addition, optional dims_index is ignored since output tensor already has squeezed shape + // by freezer and toco + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)}; + const auto dims_index{node.param().dims}; + (void)dims_index; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto fn = nnfw::cpp14::make_unique(); + fn->configure(input_alloc->handle(), output_alloc->handle()); + auto acl_fn = asAclFunction(std::move(fn)); + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TanhNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::TanhNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SoftmaxNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SoftmaxNode::Input::INPUT)}; + + const auto beta = node.param().beta; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::StridedSliceNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::StridedSliceNode::Input::INPUT)}; + const auto startData_index{node.param().startData_index}; + const auto endData_index{node.param().endData_index}; + const auto stridesData_index{node.param().stridesData_index}; + const auto beginMask_index{node.param().beginMask_index}; + const auto endMask_index{node.param().endMask_index}; + const auto shrinkAxisMask_index{node.param().shrinkAxisMask_index}; + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); + std::vector starts; + std::vector ends; + std::vector strides; + starts.resize(input_rank, 0); + ends.resize(input_rank, 0); + strides.resize(input_rank, 0); + { + auto input_shape = _ctx.at(input_index).shape(); + auto startData_base = _ctx.at(startData_index).data().base(); + auto endData_base = _ctx.at(endData_index).data().base(); + auto stridesData_base = _ctx.at(stridesData_index).data().base(); + const int startData_size = _ctx.at(startData_index).shape().num_elements(); + const int endData_size = _ctx.at(endData_index).shape().num_elements(); + const int stridesData_size = _ctx.at(stridesData_index).shape().num_elements(); + + using neurun::model::DataType; + + UNUSED_RELEASE(startData_size); + UNUSED_RELEASE(endData_size); + UNUSED_RELEASE(stridesData_size); + + assert(_ctx.at(startData_index).typeInfo().type() == 
DataType::INT32); + assert(_ctx.at(endData_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(stridesData_index).typeInfo().type() == DataType::INT32); + assert(startData_size == input_rank); + assert(endData_size == input_rank); + assert(stridesData_size == input_rank); + + assert(startData_base != nullptr); + for (int n = 0; n < input_rank; ++n) + { + auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n).value(); + + int32_t start_value = *(reinterpret_cast(startData_base) + n); + starts[axis] = start_value; + + int32_t end_value = *(reinterpret_cast(endData_base) + n); + ends[axis] = end_value; + + int32_t strides_value = *(reinterpret_cast(stridesData_base) + n); + strides[axis] = strides_value; + } + } + + // Set mask bits such as order of inputData + const auto beginMask = ::neurun::backend::acl_common::ReorderBits( + _ctx.at(beginMask_index).asScalar(), input_rank); + const auto endMask = ::neurun::backend::acl_common::ReorderBits( + _ctx.at(endMask_index).asScalar(), input_rank); + const auto shrinkAxisMask = ::neurun::backend::acl_common::ReorderBits( + _ctx.at(shrinkAxisMask_index).asScalar(), input_rank); + + auto outputData_alloc = _tensor_builder->at(output_index).get(); + auto inputData_alloc = _tensor_builder->at(input_index).get(); + + ::arm_compute::Coordinates starts_set; + ::arm_compute::Coordinates ends_set; + ::arm_compute::BiStrides strides_set; + + for (size_t i = 0; i < starts.size(); ++i) + { + starts_set.set(i, starts[i]); + ends_set.set(i, ends[i]); + strides_set.set(i, strides[i]); + } + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>(); + + l->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + strides_set, beginMask, endMask, shrinkAxisMask); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeNode &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(model::operation::TransposeNode::Input::INPUT)}; + const auto perm{node.param().perm}; + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + std::vector pv; + const auto perm_base = _ctx.at(perm).data().base(); + const int perm_size = _ctx.at(perm).shape().num_elements(); + + assert(perm_base != nullptr); + for (int32_t n = 0; n < perm_size; ++n) + { + int32_t perm_value = *(reinterpret_cast(perm_base) + n); + assert(static_cast(perm_value) < rank); + pv.emplace_back(perm_value); + } + + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + // Reversed + auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector(rank, pv); + if (acl_layout == ::arm_compute::DataLayout::NCHW && rank == 4) + { + // CWHN -> WHCN + // C : 0 -> 2, W : 1 -> 0, H : 2 -> 1, N : 3 -> 3 + ::arm_compute::PermutationVector cwhn_to_whcn_pv; + uint32_t axis[4] = {2, 0, 1, 3}; + for (size_t i = 0; i < pv.size(); ++i) + { + cwhn_to_whcn_pv.set(axis[i], axis[backend_pv[i]]); + } + backend_pv = cwhn_to_whcn_pv; + } + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), 
backend_pv); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AddNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>(); + + l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::SubNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::Conv2DNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); -void KernelGenerator::visit(const model::operation::DepthwiseConv2DNode & /*node*/) {} + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::DivNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>(); + + l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::ExpNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ExpNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); -void 
KernelGenerator::visit(const model::operation::MaxPool2DNode & /*node*/) {} + std::unique_ptr<::arm_compute::IFunction> fn; -void KernelGenerator::visit(const model::operation::AvgPool2DNode & /*node*/) {} + auto l = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>(); -void KernelGenerator::visit(const model::operation::ConcatNode & /*node*/) {} + l->configure(input_alloc->handle(), output_alloc->handle()); -void KernelGenerator::visit(const model::operation::FullyConnectedNode & /*node*/) {} + fn = std::move(l); -void KernelGenerator::visit(const model::operation::MulNode & /*node*/) {} + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::ReduceSumNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogisticNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::ReshapeNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); +} -void KernelGenerator::visit(const model::operation::SqueezeNode & /*node*/) {} +void KernelGenerator::visit(const model::operation::LogicalAndNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; -void KernelGenerator::visit(const model::operation::TanhNode & /*node*/) {} + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); -void KernelGenerator::visit(const model::operation::SoftmaxNode & /*node*/) {} + std::unique_ptr<::arm_compute::IFunction> fn; -void KernelGenerator::visit(const model::operation::StridedSliceNode & /*node*/) {} + auto l = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>(); -void KernelGenerator::visit(const model::operation::TransposeNode & /*node*/) {} + l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + ::arm_compute::BinaryLogicalOperation::AND); -void KernelGenerator::visit(const model::operation::AddNode & /*node*/) {} + fn = std::move(l); -void KernelGenerator::visit(const model::operation::SubNode & /*node*/) {} + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::DivNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); +} -void KernelGenerator::visit(const model::operation::ExpNode & /*node*/) {} +void KernelGenerator::visit(const model::operation::LSTMNode &node) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
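+  // The node exposes four outputs (scratch buffer, output state, cell state and output) and a
+  // long list of inputs, several of which are optional; the optional weights are detected further
+  // below by checking for non-empty shapes.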
+ const auto scratch_buffer_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT)}; -void KernelGenerator::visit(const model::operation::LogisticNode & /*node*/) {} + const auto input_index{node.getInputs().at(model::operation::LSTMNode::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{node.getInputs().at( + model::operation::LSTMNode::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(model::operation::LSTMNode::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; -void KernelGenerator::visit(const model::operation::LogicalAndNode & /*node*/) {} + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool 
has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); -void KernelGenerator::visit(const model::operation::LSTMNode & /*node*/) {} + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; -void KernelGenerator::visit(const model::operation::ReduceMaxNode & /*node*/) {} + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; -void KernelGenerator::visit(const model::operation::ComparisonNode & /*node*/) {} + // NOTE Although the projection weights has data the projection bias may not have data. + bool has_projection_param = has_projection_weights; -void KernelGenerator::visit(const model::operation::RSQRTNode & /*node*/) {} + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); -void KernelGenerator::visit(const model::operation::ReLUNode & /*node*/) {} + auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); + auto output_alloc = _tensor_builder->at(output_index).get(); -void KernelGenerator::visit(const model::operation::ResizeBilinearNode & /*node*/) {} + auto input_alloc = _tensor_builder->at(input_index).get(); -void KernelGenerator::visit(const model::operation::ReLU1Node & /*node*/) {} + auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_alloc = + _tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_alloc = + _tensor_builder->at(recurrent_to_output_weights_index).get(); -void KernelGenerator::visit(const model::operation::ReLU6Node & /*node*/) {} + auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); -void KernelGenerator::visit(const model::operation::RNNNode & /*node*/) {} + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); -void 
KernelGenerator::visit(const model::operation::FloorNode & /*node*/) {} + std::unique_ptr<::arm_compute::IFunction> fn; -void KernelGenerator::visit(const model::operation::SpaceToDepthNode & /*node*/) {} + auto l = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>(); -void KernelGenerator::visit(const model::operation::L2Pool2DNode & /*node*/) {} + ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_alloc = + _tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_alloc = + _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), + recurrent_to_input_weights_alloc->handle(), + cell_to_input_weights_handle, input_gate_bias_alloc->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_alloc = + _tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_alloc = + _tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), + cell_to_output_weights_alloc->handle()); + } + if (has_projection_param) + { + auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? _tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); + } -void KernelGenerator::visit(const model::operation::EmbeddingLookupNode & /*node*/) {} + l->configure( + input_alloc->handle(), input_to_forget_weights_alloc->handle(), + input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), + recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), + recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), + cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), + cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), + output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), + lstm_params, act_info, cell_clip, projection_clip); -void KernelGenerator::visit(const model::operation::L2NormalizationNode & /*node*/) {} + fn = std::move(l); -void KernelGenerator::visit(const model::operation::HashtableLookupNode & /*node*/) {} + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::PReLUNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); +} -void KernelGenerator::visit(const model::operation::TransposeConvNode & /*node*/) {} +void KernelGenerator::visit(const model::operation::ReduceMaxNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReduceMaxNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; -void KernelGenerator::visit(const model::operation::SQRTNode & /*node*/) {} + auto input_shape = _ctx.at(input_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); -void KernelGenerator::visit(const 
model::operation::LogicalOrNode & /*node*/) {} + std::vector axis; + { + const auto ifm_rank = input_shape.rank(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + int32_t axis_value = _ctx.at(axis_index).asScalar(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); -void KernelGenerator::visit(const model::operation::LogicalNotNode & /*node*/) {} + // If axis's data does not exist as constant values and can be gotten as input data, we have + // to find a way to infer output shape when sinking output. + assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + break; + } + } -void KernelGenerator::visit(const model::operation::SquaredDifferenceNode & /*node*/) {} + const auto input_rank = input_shape.rank(); -void KernelGenerator::visit(const model::operation::TopKV2Node & /*node*/) {} + auto ofm_alloc = _tensor_builder->at(output_index).get(); + auto ifm_alloc = _tensor_builder->at(input_index).get(); + std::set axes; + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + for (size_t i = 0; i < axis.size(); ++i) + { + if (acl_layout == ::arm_compute::DataLayout::NCHW && input_rank == 4) + { + axes.insert(permutation[axis[i]]); + } + else + { + axes.insert(axis[i]); + } + } -void KernelGenerator::visit(const model::operation::GatherNode & /*node*/) {} + std::unique_ptr<::arm_compute::IFunction> fn; -void KernelGenerator::visit(const model::operation::NegNode & /*node*/) {} + auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); -void KernelGenerator::visit(const model::operation::AbsNode & /*node*/) {} + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, arm_compute::ReduceOperation::MAX); -void KernelGenerator::visit(const model::operation::ArgMaxNode & /*node*/) {} + fn = std::move(l); -void KernelGenerator::visit(const model::operation::DequantizeNode & /*node*/) {} + auto acl_fn = asAclFunction(std::move(fn)); -void KernelGenerator::visit(const model::operation::MeanNode & /*node*/) {} + _execution_builder->append(std::move(acl_fn)); +} -void KernelGenerator::visit(const model::operation::LocalResponseNormalizationNode & /*node*/) {} +void KernelGenerator::visit(const model::operation::ComparisonNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)}; -void KernelGenerator::visit(const model::operation::DepthToSpaceNode & /*node*/) {} + const auto comparison_type = node.param().comparison_type; -void KernelGenerator::visit(const model::operation::ReduceMinNode & /*node*/) {} + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = 
_tensor_builder->at(input1_index).get(); -void KernelGenerator::visit(const model::operation::SplitNode & /*node*/) {} + std::unique_ptr<::arm_compute::IFunction> fn; -void KernelGenerator::visit(const model::operation::UnpackNode & /*node*/) {} + auto l = nnfw::cpp14::make_unique<::arm_compute::CLComparison>(); -void KernelGenerator::visit(const model::operation::PadNode & /*node*/) {} + l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::RSQRTNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::ReLUNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReLUNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ResizeBilinearNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinearNode::Input::INPUT)}; + const auto height_index{node.param().height_index}; + const auto width_index{node.param().width_index}; + (void)height_index; + (void)width_index; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLScale>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU1Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU1Node::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + fn = std::move(l); + + auto acl_fn = 
asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU6Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU6Node::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::RNNNode &node) +{ + const auto output_index{node.getOutputs().at(model::operation::RNNNode::Output::OUTPUT)}; + const auto hidden_state_out_index{ + node.getOutputs().at(model::operation::RNNNode::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(model::operation::RNNNode::Input::INPUT)}; + const auto weights_index{node.getInputs().at(model::operation::RNNNode::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(model::operation::RNNNode::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(model::operation::RNNNode::Input::BIAS)}; + const auto hidden_state_in_index{ + node.getInputs().at(model::operation::RNNNode::Input::HIDDEN_STATE_IN)}; + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weights_alloc = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + _execution_builder->append(asAclFunction(std::move(copy_layer))); + + std::unique_ptr<::arm_compute::IFunction> fn; + auto rnn_layer = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>(); + rnn_layer->configure(input_alloc->handle(), weights_alloc->handle(), + recurrent_weights_alloc->handle(), bias_alloc->handle(), + hidden_state_out_alloc->handle(), output_alloc->handle(), act_info); + fn = std::move(rnn_layer); + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::FloorNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::FloorNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLFloor>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); 
+} + +void KernelGenerator::visit(const model::operation::SpaceToDepthNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2Pool2DNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2Pool2DNode::Input::INPUT)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + + uint32_t kw = node.param().kw; + uint32_t kh = node.param().kh; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)}; + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lookups_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::LOOKUPS)}; + const auto values_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>(); + + l->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2NormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2NormalizationNode::Input::INPUT)}; + + // {CL|Neon}L2Normalization performs the reduction only along dimension 0 + // L2 Normalization always performs the reduction along the depth axis + // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by + // choosing normalization parameters as below + + int32_t radius = 2 * 
_ctx.at(ifm_index).shape().dim(3) + 1; // normSize = depth * 2 + 1 + float alpha = 1.0f; // In the implementation to make alpha_ become 1 + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::HashtableLookupNode &node) +{ + const auto output_index{ + node.getOutputs().at(model::operation::HashtableLookupNode::Output::OUTPUT)}; + const auto hits_index{node.getOutputs().at(model::operation::HashtableLookupNode::Output::HITS)}; + + const auto lookups_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(model::operation::HashtableLookupNode::Input::KEYS)}; + const auto values_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hits_alloc = _tensor_builder->at(hits_index).get(); + + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto keys_alloc = _tensor_builder->at(keys_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>(); + + l->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), + output_alloc->handle(), hits_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::PReLUNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>(); + + l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeConvNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto output_shape_index{ + node.getInputs().at(model::operation::TransposeConvNode::Input::OUTPUT_SHAPE)}; + const auto ker_index{node.getInputs().at(model::operation::TransposeConvNode::Input::KERNEL)}; + const auto ifm_index{node.getInputs().at(model::operation::TransposeConvNode::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + const auto ker_shape = 
_ctx.at(ker_index).shape().asFeature(); + + const auto stride = node.param().stride; + + assert((node.param().padding.type == model::PaddingType::SAME) || + (node.param().padding.type == model::PaddingType::VALID)); + auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); + + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == model::PaddingType::VALID) + { + invalid_horizontal = + ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1); + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>(); + + l->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, + invalid_vertical, invalid_horizontal); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SQRTNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SQRTNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + l->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalOrNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>(); + + l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalNotNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::LogicalNotNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>(); + + l->configure(input_alloc->handle(), output_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = 
asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SquaredDifferenceNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + + l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TopKV2Node &node) +{ + const auto outputValues_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_VALUES)}; + const auto outputIndices_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_INDICES)}; + + const auto inputData_index{node.getInputs().at(model::operation::TopKV2Node::Input::INPUT)}; + const auto k_index{node.param().k_index}; + + // Currently, we only support rank-1 or rank-2 inputs. + assert(_ctx.at(inputData_index).shape().rank() == 1 || + _ctx.at(inputData_index).shape().rank() == 2); + + const auto k = _ctx.at(k_index).asScalar<int32_t>(); + + auto values_alloc = _tensor_builder->at(outputValues_index).get(); + auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); + auto input_alloc = _tensor_builder->at(inputData_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>(); + + l->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::GatherNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; + const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; + + const auto axis_index{node.param().axis_index}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + const auto axis_value = static_cast<int32_t>(_ctx.at(axis_index).asScalar<int32_t>()); + const int axis = + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto acl_layout = ofm_alloc->handle()->info()->data_layout(); + UNUSED_RELEASE(acl_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It + // is not efficient even if it works well. Instead, it would be better to set the + // layout of these backend tensors to the same layout. + // There is also one thing we have to think about. This operation depends on the layout of + // a model.
For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work on the axes W and C, but the axes W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. + // TODO Remove this workaround + // It is a workaround that sets the layout of these backend tensors to the layout of the + // frontend when creating them + // TODO Support frontend in NCHW + // TODO Change the layout of frontend and backend to be the same + // assert(::arm_compute::DataLayout::NHWC == acl_layout); + assert(acl_layout == ifm_alloc->handle()->info()->data_layout()); + assert(acl_layout == indices_alloc->handle()->info()->data_layout()); + + std::unique_ptr<::arm_compute::IFunction> fn; + // TODO Change to CLGather + auto l = nnfw::cpp14::make_unique<::arm_compute::misc::GenericGather>(); + + l->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::NegNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::NegNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLNeg>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AbsNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::AbsNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + l->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ArgMaxNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + assert(_ctx.at(axis_index).isConstant()); + // Axis dimension is always 1. + assert(axis_shape.rank() == 1); + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + + std::vector<uint32_t> l_axis; + const int axis_size = axis_shape.num_elements(); + auto axis_base = _ctx.at(axis_index).data().base(); + // TODO Should support axis size > 1. + assert(axis_size == 1); + // axis is a tensor with 1 dimension - always a vector.
+ assert(axis_base != nullptr); + + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_shape.rank(); + } + l_axis.push_back(acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value()); + } + + const auto ifm_rank = ifm_shape.rank(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + if (acl_layout == ::arm_compute::DataLayout::NCHW && ifm_rank == 4) + { + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + for (size_t i = 0; i < l_axis.size(); ++i) + l_axis[i] = permutation[l_axis[i]]; + } + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), l_axis, ::arm_compute::ArgOperation::MAX); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DequantizeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DequantizeNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + l->configure(input_alloc->handle(), output_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::MeanNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::MeanNode::Input::INPUT)}; + + const auto axis_index{node.param().axis_index}; + const auto keep_dims_index{node.param().keep_dims_index}; + (void)keep_dims_index; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + std::vector<uint32_t> axis; + { + const auto ifm_rank = ifm_shape.rank(); + const auto axis_shape = _ctx.at(axis_index).shape(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If the axis data does not exist as constant values and is given as input data instead, we have + // to find a way to infer the output shape when sinking the output.
+ assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + } + } + + const auto ifm_rank = ifm_shape.rank(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::set<uint32_t> fixed_axis; + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + for (auto a : axis) + { + if (acl_layout == ::arm_compute::DataLayout::NCHW && ifm_rank == 4) + { + fixed_axis.insert(permutation[a]); + } + else + { + fixed_axis.insert(a); + } + } + + std::unique_ptr<::arm_compute::IFunction> fn; + + // NOTE CLReduceMean has a bug: it does not support the NHWC layout + // CLReduceMean intermediate tensors are always NCHW layout + auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), fixed_axis, + ::arm_compute::ReduceOperation::MEAN); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LocalResponseNormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{ + node.getInputs().at(model::operation::LocalResponseNormalizationNode::Input::INPUT)}; + const auto radius_index{node.param().radius_index}; + const auto bias_index{node.param().bias_index}; + const auto alpha_index{node.param().alpha_index}; + const auto beta_index{node.param().beta_index}; + + auto radius = _ctx.at(radius_index).asScalar<int32_t>(); + auto alpha = _ctx.at(alpha_index).asScalar<float>(); + auto beta = _ctx.at(beta_index).asScalar<float>(); + auto bias = _ctx.at(bias_index).asScalar<float>(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DepthToSpaceNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DepthToSpaceNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar<int32_t>(); + assert(block_size > 0); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>(); + + l->configure(input_alloc->handle(), output_alloc->handle(), block_size); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void 
KernelGenerator::visit(const model::operation::ReduceMinNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReduceMinNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + std::vector<uint32_t> axis; + { + const auto ifm_rank = ifm_shape.rank(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If the axis data does not exist as constant values and is given as input data instead, we have + // to find a way to infer the output shape when sinking the output. + assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axis.emplace_back( + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + break; + } + } + + const auto ifm_rank = ifm_shape.rank(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::set<uint32_t> fixed_axis; + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + for (auto a : axis) + { + if (acl_layout == ::arm_compute::DataLayout::NCHW && ifm_rank == 4) + { + fixed_axis.insert(permutation[a]); + } + else + { + fixed_axis.insert(a); + } + } + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), fixed_axis, + ::arm_compute::ReduceOperation::MIN); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SplitNode &node) +{ + const auto ifm_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + const auto num_of_splits_index{node.param().num_of_splits_index}; + + assert(_ctx.at(num_of_splits_index).asScalar<int32_t>() == node.getOutputs().size()); + + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis < 0) + axis += ifm_rank; + axis = acl_common::ToARMComputeAxis(ifm_rank, axis).value(); + + std::vector<model::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ICLTensor *> output_allocs; + for (const auto &ofm_ind : output_indexes) + output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + auto fixed_axis = axis; + auto acl_layout = ifm_alloc->handle()->info()->data_layout(); + + if (acl_layout == ::arm_compute::DataLayout::NCHW && ifm_rank == 4) + { + // CWHN -> WHCN + uint32_t permutation[4] = {2, 0, 1, 3}; + fixed_axis = permutation[fixed_axis]; + 
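// The remapping above is needed because, in ACL coordinate order, an NHWC tensor is indexed + // as CWHN while an NCHW tensor is indexed as WHCN, so an axis computed against the frontend + // (NHWC) layout has to be translated for a backend tensor created with the NCHW layout. +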
} + + std::unique_ptr<::arm_compute::IFunction> fn; + + // TODO Support NCHW frontend + // TODO Change the layout of frontend and backend to be the same + auto l = nnfw::cpp14::make_unique<::arm_compute::CLSplit>(); + + l->configure(ifm_alloc->handle(), output_allocs, fixed_axis); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::UnpackNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)}; + auto axis{node.param().axis}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + + if (axis < 0) + axis += input_rank; + axis = acl_common::ToARMComputeAxis(input_rank, axis).value(); + + std::vector<model::OperandIndex> output_indexes; + for (const auto &output_index : node.getOutputs()) + output_indexes.emplace_back(output_index); + + auto input = _tensor_builder->at(input_index).get()->handle(); + std::vector<arm_compute::ICLTensor *> outputs; + for (const auto &output_index : output_indexes) + outputs.emplace_back(_tensor_builder->at(output_index)->handle()); + + int fixed_axis = axis; + if (input->info()->num_dimensions() == 4 && + input->info()->data_layout() == ::arm_compute::DataLayout::NCHW) + { + // CWHN -> WHCN + const int permutation[4] = {2, 0, 1, 3}; + fixed_axis = permutation[fixed_axis]; + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>(); + + fn->configure(input, outputs, fixed_axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::PadNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)}; + const auto pad_index{node.getInputs().at(model::operation::PadNode::Input::PAD)}; + const auto output_index{node.getOutputs().at(0)}; + assert(_ctx.at(pad_index).isConstant()); + + auto rank = _ctx.at(pad_index).shape().dim(0); + auto pad_base = _ctx.at(pad_index).data().base(); + ::arm_compute::PaddingList padding_list; + padding_list.resize(rank); + for (int32_t n = 0; n < rank; ++n) + { + const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); + auto axis = acl_common::ToARMComputeAxis(rank, n).value(); + + padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; + } + + auto input_type = _ctx.at(input_index).typeInfo(); + auto data_type = acl_common::asDataType(input_type.type()); + auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()); + const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info); + + auto input = _tensor_builder->at(input_index).get()->handle(); + auto output = _tensor_builder->at(output_index).get()->handle(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>(); + fn->configure(input, output, padding_list, pixel_value); + + _execution_builder->append(asAclFunction(std::move(fn))); +} } // namespace acl_cl } // namespace backend diff --git a/runtimes/neurun/backend/acl_cl/StageGenerator.cc b/runtimes/neurun/backend/acl_cl/StageGenerator.cc index eb245a9..e9da59d 100644 --- a/runtimes/neurun/backend/acl_cl/StageGenerator.cc +++ b/runtimes/neurun/backend/acl_cl/StageGenerator.cc @@ -46,102 +46,6 @@ namespace acl_cl using ::neurun::backend::acl_common::asAclFunction; // -// ActivationBuilder -// -class ActivationBuilder -{ -public: - ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) - { - // DO NOTHING - } - -private: - void appendReLU(::arm_compute::ICLTensor *tensor); - void 
appendReLU1(::arm_compute::ICLTensor *tensor); - void appendReLU6(::arm_compute::ICLTensor *tensor); - -public: - void append(model::Activation code, ::arm_compute::ICLTensor *tensor); - -private: - IExecutionBuilder &_builder; -}; - -void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) -{ - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - fn->configure(ifm_alloc, nullptr, act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - _builder.append(std::move(acl_fn)); -} - -void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) -{ - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - fn->configure(ifm_alloc, nullptr, act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - _builder.append(std::move(acl_fn)); -} - -void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) -{ - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - fn->configure(ifm_alloc, nullptr, act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - _builder.append(std::move(acl_fn)); -} - -void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc) -{ - switch (code) - { - case model::Activation::NONE: - { - // DO NOTHING - break; - } - case model::Activation::RELU: - { - appendReLU(ifm_alloc); - break; - } - case model::Activation::RELU1: - { - appendReLU1(ifm_alloc); - break; - } - case model::Activation::RELU6: - { - appendReLU6(ifm_alloc); - break; - } - default: - { - throw std::runtime_error("Not supported, yet"); - } - } -} - -// // StageGenerator // StageGenerator::StageGenerator(const neurun::model::Operands &ctx, @@ -151,3182 +55,115 @@ StageGenerator::StageGenerator(const neurun::model::Operands &ctx, // DO NOTHING } -void StageGenerator::visit(const model::operation::CastNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::CastNode::Input::INPUT)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::Conv2DNode &node) -{ - using model::operation::Conv2DNode; - - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(Conv2DNode::Input::INPUT)}; - const auto ker_index{node.getInputs().at(Conv2DNode::Input::KERNEL)}; - const auto bias_index{node.getInputs().at(Conv2DNode::Input::BIAS)}; - - const auto ifm_shape = 
_ctx.at(ifm_index).shape().asFeature(); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. - const auto &ker_shape = _ctx.at(ker_index).shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - model::OperandIndex ker_index; - model::OperandIndex bias_index; - - model::ExplicitPadding padding; - model::Stride stride; - model::Activation activation; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.ker_index = ker_index; - param.bias_index = bias_index; - - param.stride = node.param().stride; - param.padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, - param.stride, ker_width, ker_height); - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - auto ker_alloc = tensors->at(param.ker_index).get(); - auto bias_alloc = tensors->at(param.bias_index).get(); - - const auto conv_info = acl_common::asPadStrideInfo(param.padding, param.stride); - const auto act_info = acl_common::asActivationLayerInfo(param.activation); - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>(); - - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, ::arm_compute::WeightsInfo(), - ::arm_compute::Size2D(1U, 1U), act_info); - - builder.append(asAclFunction(std::move(fn))); - }); -} - -void StageGenerator::visit(const model::operation::DepthwiseConv2DNode &node) -{ - using model::operation::DepthwiseConv2DNode; - - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(DepthwiseConv2DNode::Input::INPUT)}; - const auto ker_index{node.getInputs().at(DepthwiseConv2DNode::Input::KERNEL)}; - const auto bias_index{node.getInputs().at(DepthwiseConv2DNode::Input::BIAS)}; - - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
- const auto &ker_shape = _ctx.at(ker_index).shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - model::OperandIndex ker_index; - model::OperandIndex bias_index; - - model::ExplicitPadding padding; - neurun::model::Stride stride; - int multiplier; - - model::Activation activation; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.ker_index = ker_index; - param.bias_index = bias_index; - - param.stride = node.param().stride; - param.padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, - param.stride, ker_width, ker_height); - param.multiplier = node.param().multiplier; - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - auto ker_alloc = tensors->at(param.ker_index).get(); - auto bias_alloc = tensors->at(param.bias_index).get(); - - const auto conv_info = acl_common::asPadStrideInfo(param.padding, param.stride); - // TODO Use `param.activation` instead of `model::Activation::NONE`. See below. - const auto act_info = acl_common::asActivationLayerInfo(model::Activation::NONE); - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, param.multiplier, act_info); - - builder.append(asAclFunction(std::move(fn))); - - // TODO Use fused activation instead of separate layer after switching to ACL version >= v19.05. - // Prior versions had a bug due to which the fused activation did not apply in some cases. 
- ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::MaxPool2DNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::MaxPool2DNode::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - uint32_t kw; - uint32_t kh; - - model::ExplicitPadding padding; - model::Stride stride; - model::Activation activation; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - param.kh = node.param().kh; - param.kw = node.param().kw; - param.stride = node.param().stride; - param.padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, - param.stride, param.kw, param.kh); - param.activation = node.param().activation; - - VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "KER_H: " << param.kh << std::endl; - VERBOSE(MaxPool2D) << "KER_W: " << param.kw << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_H: " << param.stride.vertical << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_W: " << param.stride.horizontal << std::endl; - VERBOSE(MaxPool2D) << "PAD(T): " << param.padding.top << std::endl; - VERBOSE(MaxPool2D) << "PAD(B): " << param.padding.bottom << std::endl; - VERBOSE(MaxPool2D) << "PAD(L): " << param.padding.left << std::endl; - VERBOSE(MaxPool2D) << "PAD(R): " << param.padding.right << std::endl; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{param.kw, param.kh}, - acl_common::asPadStrideInfo(param.padding, param.stride)}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append((std::move(acl_fn))); - - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::AvgPool2DNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::AvgPool2DNode::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - uint32_t kw; - uint32_t kh; - - model::ExplicitPadding padding; - model::Stride stride; - model::Activation activation; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - param.kh = node.param().kh; - param.kw = node.param().kw; - param.stride = node.param().stride; - param.padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, - param.stride, param.kw, param.kh); - param.activation = node.param().activation; - - VERBOSE(AvgPool2D) << 
"IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "KER_H: " << param.kh << std::endl; - VERBOSE(AvgPool2D) << "KER_W: " << param.kw << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_H: " << param.stride.vertical << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_W: " << param.stride.horizontal << std::endl; - VERBOSE(AvgPool2D) << "PAD(T): " << param.padding.top << std::endl; - VERBOSE(AvgPool2D) << "PAD(B): " << param.padding.bottom << std::endl; - VERBOSE(AvgPool2D) << "PAD(L): " << param.padding.left << std::endl; - VERBOSE(AvgPool2D) << "PAD(R): " << param.padding.right << std::endl; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{param.kw, param.kh}, - acl_common::asPadStrideInfo(param.padding, param.stride), true /* exclude_padding */}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append((std::move(acl_fn))); - - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::ConcatNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto axis_index{node.param().axis_index}; - - struct Param - { - model::OperandIndex output_index; - std::vector input_indexes; - - int32_t axis; - }; - - Param param; - - param.output_index = ofm_index; - for (const auto &e : node.getInputs()) - param.input_indexes.emplace_back(e); - param.axis = _ctx.at(axis_index).asScalar(); - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - // If tensor allocator allocate as subtensor - bool canEliminate = true; - for (auto ifm_ind : param.input_indexes) - { - if (!tensors->isSubTensorOf(param.output_index, ifm_ind)) - { - canEliminate = false; - break; - } - } - if (canEliminate) - { - // If concat eliminated, return a NOP IFunction - builder.append(nnfw::cpp14::make_unique()); - return; - } - - auto output_alloc = tensors->at(param.output_index).get(); - - std::vector<::neurun::backend::acl_cl::operand::ICLTensor *> input_allocs; - for (auto ifm_ind : param.input_indexes) - { - input_allocs.emplace_back(tensors->at(ifm_ind).get()); - } - - auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>(); - - fn->configure(input_allocs, param.axis, output_alloc); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::FullyConnectedNode &node) -{ - using model::operation::FullyConnectedNode; - - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnectedNode::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnectedNode::Input::BIAS)}; - - auto tensors = _tensor_builder; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - // TODO Currently we are not handling where the case is that the input's rank is 3. 
- // The handling should be added in the future. - assert(input_rank != 3); - - const auto output_size = _ctx.at(output_index).shape().dim(1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = _ctx.at(output_index).shape().dim(0); - const auto input_size = _ctx.at(weight_index).shape().dim(1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - neurun::model::Shape reshape(2); - if (input_rank == 4) - { - // TODO Support NCHW frontend - model::FeatureShape ifm_shape_feature = _ctx.at(input_index).shape().asFeature(); - auto feature_size = - ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W; - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - - model::OperandIndex input_index; - model::OperandIndex weight_index; - model::OperandIndex bias_index; - - model::Activation activation; - - bool needs_reshape; - neurun::model::Shape reshape; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - param.weight_index = weight_index; - param.bias_index = bias_index; - - param.activation = node.param().activation; - - param.needs_reshape = needs_reshape; - param.reshape = reshape; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - auto weight_alloc = tensors->at(param.weight_index).get(); - auto bias_alloc = tensors->at(param.bias_index).get(); - auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = nnfw::cpp14::make_unique(); - - fn->configure(input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), - output_alloc->handle(), param.needs_reshape, - ::neurun::backend::acl_common::asTensorShape( - param.reshape, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout))); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - - ActivationBuilder{builder}.append(param.activation, output_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::MulNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; - const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex lhs_index; - model::OperandIndex rhs_index; - - model::Activation activation; - }; - - Param param; - - param.ofm_index = output_index; - param.lhs_index = lhs_index; - param.rhs_index = rhs_index; - - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto lhs_alloc = tensors->at(param.lhs_index).get(); - auto rhs_alloc = tensors->at(param.rhs_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>(); - - l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale - 
arm_compute::ConvertPolicy::SATURATE, - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::ReduceSumNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::ReduceSumNode::Input::INPUT)}; - const auto axis_index{node.param().axis_index}; - - std::vector axes; - const auto axis_base = _ctx.at(axis_index).data().base(); - const auto axis_size = _ctx.at(axis_index).shape().num_elements(); - const auto input_rank = _ctx.at(input_index).shape().rank(); - - // The axis's data must exist as constant values - assert(axis_base != nullptr); - for (size_t n = 0; n < axis_size; ++n) - { - int32_t axis_value = *(reinterpret_cast(axis_base) + n); - if (axis_value < 0) - { - axis_value += input_rank; - } - axes.emplace_back( - ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, axis_value).value()); - } - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - - std::vector axes; - uint32_t input_rank; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - param.axes = std::move(axes); - param.input_rank = input_rank; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](compiler::IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - std::set axes; - // TODO Support NCHW frontend - // TODO Change the layout of frontend and backend to be the same - auto acl_layout = input_alloc->handle()->info()->data_layout(); - // CWHN -> WHCN - uint32_t permutation[4] = {2, 0, 1, 3}; - for (size_t i = 0; i < param.axes.size(); ++i) - { - if (acl_layout == ::arm_compute::DataLayout::NCHW && param.input_rank == 4) - { - axes.insert(permutation[param.axes[i]]); - } - else - { - axes.insert(param.axes[i]); - } - } - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); - - fn->configure(input_alloc->handle(), output_alloc->handle(), axes, - ::arm_compute::ReduceOperation::SUM); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ReshapeNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - // NOTE This operation must not be changed the layout from frontend to backend - // However, this runtime can be change the layout of this operation from NHWC to NCHW now - // TODO Change the layout of frontend and backend to be the same and layer to CLReshapeLayer - auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericReshapeLayer>(); - - fn->configure(input_alloc->handle(), output_alloc->handle()); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} 
- -void StageGenerator::visit(const model::operation::SqueezeNode &node) -{ - // Squeeze is identical to reshape except that it has an optional dimensions input. - // In addition, optional dims_index is ignored since output tensor already has squeezed shape - // by freezer and toco - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)}; - const auto dims_index{node.param().dims}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param{output_index, input_index}; - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - auto fn = nnfw::cpp14::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); - auto acl_fn = asAclFunction(std::move(fn)); - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::TanhNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::TanhNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - auto fn = nnfw::cpp14::make_unique(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::SoftmaxNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::SoftmaxNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - float beta; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - param.beta = node.param().beta; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(); - - fn->configure(input_alloc->handle(), output_alloc->handle(), param.beta); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::StridedSliceNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::StridedSliceNode::Input::INPUT)}; - const auto startData_index{node.param().startData_index}; - const auto endData_index{node.param().endData_index}; - const auto stridesData_index{node.param().stridesData_index}; - const auto beginMask_index{node.param().beginMask_index}; - const auto endMask_index{node.param().endMask_index}; - const auto shrinkAxisMask_index{node.param().shrinkAxisMask_index}; - - // Set initializers for indices data 
such as order of inputData - int input_rank = _ctx.at(input_index).shape().rank(); - std::vector starts; - std::vector ends; - std::vector strides; - starts.resize(input_rank, 0); - ends.resize(input_rank, 0); - strides.resize(input_rank, 0); - { - auto input_shape = _ctx.at(input_index).shape(); - auto startData_base = _ctx.at(startData_index).data().base(); - auto endData_base = _ctx.at(endData_index).data().base(); - auto stridesData_base = _ctx.at(stridesData_index).data().base(); - const int startData_size = _ctx.at(startData_index).shape().num_elements(); - const int endData_size = _ctx.at(endData_index).shape().num_elements(); - const int stridesData_size = _ctx.at(stridesData_index).shape().num_elements(); - - using neurun::model::DataType; - - UNUSED_RELEASE(startData_size); - UNUSED_RELEASE(endData_size); - UNUSED_RELEASE(stridesData_size); - - assert(_ctx.at(startData_index).typeInfo().type() == DataType::INT32); - assert(_ctx.at(endData_index).typeInfo().type() == DataType::INT32); - assert(_ctx.at(stridesData_index).typeInfo().type() == DataType::INT32); - assert(startData_size == input_rank); - assert(endData_size == input_rank); - assert(stridesData_size == input_rank); - - assert(startData_base != nullptr); - for (int n = 0; n < input_rank; ++n) - { - auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n).value(); - - int32_t start_value = *(reinterpret_cast(startData_base) + n); - starts[axis] = start_value; - - int32_t end_value = *(reinterpret_cast(endData_base) + n); - ends[axis] = end_value; +void StageGenerator::visit(const model::operation::CastNode &) {} - int32_t strides_value = *(reinterpret_cast(stridesData_base) + n); - strides[axis] = strides_value; - } - } +void StageGenerator::visit(const model::operation::Conv2DNode &) {} - struct Param - { - model::OperandIndex outputData_index; - model::OperandIndex inputData_index; +void StageGenerator::visit(const model::operation::DepthwiseConv2DNode &) {} - std::vector starts; - std::vector ends; - std::vector strides; +void StageGenerator::visit(const model::operation::MaxPool2DNode &) {} - int32_t beginMask; - int32_t endMask; - int32_t shrinkAxisMask; - }; +void StageGenerator::visit(const model::operation::AvgPool2DNode &) {} - Param param; - param.outputData_index = output_index; - param.inputData_index = input_index; +void StageGenerator::visit(const model::operation::ConcatNode &) {} - param.starts = starts; - param.ends = ends; - param.strides = strides; +void StageGenerator::visit(const model::operation::FullyConnectedNode &) {} - // Set mask bits such as order of inputData - param.beginMask = ::neurun::backend::acl_common::ReorderBits( - _ctx.at(beginMask_index).asScalar(), input_rank); - param.endMask = ::neurun::backend::acl_common::ReorderBits( - _ctx.at(endMask_index).asScalar(), input_rank); - param.shrinkAxisMask = ::neurun::backend::acl_common::ReorderBits( - _ctx.at(shrinkAxisMask_index).asScalar(), input_rank); +void StageGenerator::visit(const model::operation::MulNode &) {} - auto tensors = _tensor_builder; +void StageGenerator::visit(const model::operation::ReduceSumNode &) {} - returnStage([tensors, param](IExecutionBuilder &builder) { - auto outputData_alloc = tensors->at(param.outputData_index).get(); - auto inputData_alloc = tensors->at(param.inputData_index).get(); +void StageGenerator::visit(const model::operation::ReshapeNode &) {} - ::arm_compute::Coordinates starts; - ::arm_compute::Coordinates ends; - ::arm_compute::BiStrides strides; +void StageGenerator::visit(const 
model::operation::SqueezeNode &) {} - for (size_t i = 0; i < param.starts.size(); ++i) - { - starts.set(i, param.starts[i]); - ends.set(i, param.ends[i]); - strides.set(i, param.strides[i]); - } +void StageGenerator::visit(const model::operation::TanhNode &) {} - std::unique_ptr<::arm_compute::IFunction> fn; +void StageGenerator::visit(const model::operation::SoftmaxNode &) {} - auto l = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>(); +void StageGenerator::visit(const model::operation::StridedSliceNode &) {} - l->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts, ends, strides, - param.beginMask, param.endMask, param.shrinkAxisMask); +void StageGenerator::visit(const model::operation::TransposeNode &) {} - fn = std::move(l); +void StageGenerator::visit(const model::operation::AddNode &) {} - auto acl_fn = asAclFunction(std::move(fn)); +void StageGenerator::visit(const model::operation::SubNode &) {} - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::TransposeNode &node) -{ - const auto ofm_idx{node.getOutputs().at(0)}; - const auto ifm_idx{node.getInputs().at(model::operation::TransposeNode::Input::INPUT)}; - const auto perm{node.param().perm}; - - const auto rank = _ctx.at(ifm_idx).shape().rank(); - std::vector pv; - const auto perm_base = _ctx.at(perm).data().base(); - const int perm_size = _ctx.at(perm).shape().num_elements(); - - assert(perm_base != nullptr); - for (int32_t n = 0; n < perm_size; ++n) - { - int32_t perm_value = *(reinterpret_cast(perm_base) + n); - assert(static_cast(perm_value) < rank); - pv.emplace_back(perm_value); - } - - struct Param - { - model::OperandIndex ifm_idx; - model::OperandIndex ofm_idx; - - std::vector pv; - uint32_t rank; - }; - - Param param; +void StageGenerator::visit(const model::operation::DivNode &) {} - param.ifm_idx = ifm_idx; - param.ofm_idx = ofm_idx; - param.pv = pv; - param.rank = rank; +void StageGenerator::visit(const model::operation::ExpNode &) {} - auto tensors = _tensor_builder; +void StageGenerator::visit(const model::operation::LogisticNode &) {} - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_idx).get(); - auto ifm_alloc = tensors->at(param.ifm_idx).get(); - // TODO Support NCHW frontend - // TODO Change the layout of frontend and backend to be the same - auto acl_layout = ifm_alloc->handle()->info()->data_layout(); - // Reversed - auto pv = ::neurun::backend::acl_common::getARMComputePermutationVector(param.rank, param.pv); - if (acl_layout == ::arm_compute::DataLayout::NCHW && param.rank == 4) - { - // CWHN -> WHCN - // C : 0 -> 2, W : 1 -> 0, H : 2 -> 1, N : 3 -> 3 - ::arm_compute::PermutationVector cwhn_to_whcn_pv; - uint32_t axis[4] = {2, 0, 1, 3}; - for (size_t i = 0; i < param.pv.size(); ++i) - { - cwhn_to_whcn_pv.set(axis[i], axis[pv[i]]); - } - pv = cwhn_to_whcn_pv; - } +void StageGenerator::visit(const model::operation::LogicalAndNode &) {} - std::unique_ptr<::arm_compute::IFunction> fn; +void StageGenerator::visit(const model::operation::LSTMNode &) {} - auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); +void StageGenerator::visit(const model::operation::ReduceMaxNode &) {} - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); +void StageGenerator::visit(const model::operation::ComparisonNode &) {} - fn = std::move(l); +void StageGenerator::visit(const model::operation::RSQRTNode &) {} - auto acl_fn = asAclFunction(std::move(fn)); +void StageGenerator::visit(const 
model::operation::ReLUNode &) {} - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::AddNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; - const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex lhs_index; - model::OperandIndex rhs_index; +void StageGenerator::visit(const model::operation::ResizeBilinearNode &) {} - model::Activation activation; - }; +void StageGenerator::visit(const model::operation::ReLU1Node &) {} - Param param; +void StageGenerator::visit(const model::operation::ReLU6Node &) {} - param.ofm_index = output_index; - param.lhs_index = lhs_index; - param.rhs_index = rhs_index; +void StageGenerator::visit(const model::operation::RNNNode &) {} - param.activation = node.param().activation; +void StageGenerator::visit(const model::operation::FloorNode &) {} - auto tensors = _tensor_builder; +void StageGenerator::visit(const model::operation::SpaceToDepthNode &) {} - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto lhs_alloc = tensors->at(param.lhs_index).get(); - auto rhs_alloc = tensors->at(param.rhs_index).get(); +void StageGenerator::visit(const model::operation::L2Pool2DNode &) {} - std::unique_ptr<::arm_compute::IFunction> fn; +void StageGenerator::visit(const model::operation::EmbeddingLookupNode &) {} - auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>(); +void StageGenerator::visit(const model::operation::L2NormalizationNode &) {} - l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), - arm_compute::ConvertPolicy::SATURATE); +void StageGenerator::visit(const model::operation::HashtableLookupNode &) {} - fn = std::move(l); +void StageGenerator::visit(const model::operation::PReLUNode &) {} - auto acl_fn = asAclFunction(std::move(fn)); +void StageGenerator::visit(const model::operation::TransposeConvNode &) {} - builder.append(std::move(acl_fn)); +void StageGenerator::visit(const model::operation::SQRTNode &) {} - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::SubNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; - const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; +void StageGenerator::visit(const model::operation::LogicalOrNode &) {} - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex lhs_index; - model::OperandIndex rhs_index; +void StageGenerator::visit(const model::operation::LogicalNotNode &) {} - model::Activation activation; - }; +void StageGenerator::visit(const model::operation::SquaredDifferenceNode &) {} - Param param; +void StageGenerator::visit(const model::operation::TopKV2Node &) {} - param.ofm_index = output_index; - param.lhs_index = lhs_index; - param.rhs_index = rhs_index; +void StageGenerator::visit(const model::operation::GatherNode &) {} - param.activation = node.param().activation; +void StageGenerator::visit(const model::operation::NegNode &) {} - auto tensors = _tensor_builder; +void StageGenerator::visit(const model::operation::AbsNode &) {} - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = 
tensors->at(param.ofm_index).get(); - auto lhs_alloc = tensors->at(param.lhs_index).get(); - auto rhs_alloc = tensors->at(param.rhs_index).get(); +void StageGenerator::visit(const model::operation::ArgMaxNode &) {} - std::unique_ptr<::arm_compute::IFunction> fn; +void StageGenerator::visit(const model::operation::DequantizeNode &) {} - auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>(); +void StageGenerator::visit(const model::operation::MeanNode &) {} - l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), - arm_compute::ConvertPolicy::SATURATE); +void StageGenerator::visit(const model::operation::LocalResponseNormalizationNode &) {} - fn = std::move(l); +void StageGenerator::visit(const model::operation::DepthToSpaceNode &) {} - auto acl_fn = asAclFunction(std::move(fn)); +void StageGenerator::visit(const model::operation::ReduceMinNode &) {} - builder.append(std::move(acl_fn)); +void StageGenerator::visit(const model::operation::SplitNode &) {} - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} +void StageGenerator::visit(const model::operation::UnpackNode &) {} -void StageGenerator::visit(const model::operation::DivNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; - const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex lhs_index; - model::OperandIndex rhs_index; - - model::Activation activation; - }; - - Param param; - - param.ofm_index = output_index; - param.lhs_index = lhs_index; - param.rhs_index = rhs_index; - - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto lhs_alloc = tensors->at(param.lhs_index).get(); - auto rhs_alloc = tensors->at(param.rhs_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>(); - - l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::ExpNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::ExpNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>(); - - l->configure(input_alloc->handle(), output_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::LogisticNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto 
input_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = output_index; - param.ifm_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::LogicalAndNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; - const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input0_index; - model::OperandIndex input1_index; - }; - - Param param; - - param.output_index = output_index; - param.input0_index = input0_index; - param.input1_index = input1_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input0_alloc = tensors->at(param.input0_index).get(); - auto input1_alloc = tensors->at(param.input1_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>(); - - l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), - ::arm_compute::BinaryLogicalOperation::AND); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::LSTMNode &node) -{ - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
- const auto scratch_buffer_index{ - node.getOutputs().at(model::operation::LSTMNode::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(model::operation::LSTMNode::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(model::operation::LSTMNode::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{node.getInputs().at( - model::operation::LSTMNode::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(model::operation::LSTMNode::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(model::operation::LSTMNode::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(model::operation::LSTMNode::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{ - node.getInputs().at(model::operation::LSTMNode::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = 
_ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - struct Param - { - model::OperandIndex scratch_buffer_index; - model::OperandIndex output_state_out_index; - model::OperandIndex cell_state_out_index; - model::OperandIndex output_index; - - model::OperandIndex input_index; - model::OperandIndex input_to_forget_weights_index; - model::OperandIndex input_to_cell_weights_index; - model::OperandIndex input_to_output_weights_index; - model::OperandIndex recurrent_to_forget_weights_index; - model::OperandIndex recurrent_to_cell_weights_index; - model::OperandIndex recurrent_to_output_weights_index; - model::OperandIndex forget_gate_bias_index; - model::OperandIndex cell_bias_index; - model::OperandIndex output_gate_bias_index; - model::OperandIndex output_state_in_index; - model::OperandIndex cell_state_in_index; - model::Activation activation; - float cell_clip; - float projection_clip; - - // CIFG params - model::OperandIndex input_to_input_weights_index; - model::OperandIndex recurrent_to_input_weights_index; - model::OperandIndex cell_to_input_weights_index; - model::OperandIndex input_gate_bias_index; - - // peephole params - model::OperandIndex cell_to_forget_weights_index; - model::OperandIndex cell_to_output_weights_index; - - // projection params - model::OperandIndex projection_weights_index; - model::OperandIndex projection_bias_index; - - // LSTM options - bool has_cifg_param; - bool has_peephole_param; - bool has_projection_param; - bool has_projection_bias; - }; - - Param param; - - param.scratch_buffer_index = scratch_buffer_index; - param.output_state_out_index = output_state_out_index; - param.cell_state_out_index = cell_state_out_index; - param.output_index = output_index; - - param.input_index = input_index; - param.input_to_input_weights_index = input_to_input_weights_index; - param.input_to_forget_weights_index = input_to_forget_weights_index; - param.input_to_cell_weights_index = input_to_cell_weights_index; - param.input_to_output_weights_index = input_to_output_weights_index; - param.recurrent_to_input_weights_index = recurrent_to_input_weights_index; - param.recurrent_to_forget_weights_index = recurrent_to_forget_weights_index; - param.recurrent_to_cell_weights_index = recurrent_to_cell_weights_index; - param.recurrent_to_output_weights_index = recurrent_to_output_weights_index; - param.cell_to_input_weights_index = cell_to_input_weights_index; - param.cell_to_forget_weights_index = cell_to_forget_weights_index; - param.cell_to_output_weights_index = cell_to_output_weights_index; - 
param.input_gate_bias_index = input_gate_bias_index; - param.forget_gate_bias_index = forget_gate_bias_index; - param.cell_bias_index = cell_bias_index; - param.output_gate_bias_index = output_gate_bias_index; - param.projection_weights_index = projection_weights_index; - param.projection_bias_index = projection_bias_index; - param.output_state_in_index = output_state_in_index; - param.cell_state_in_index = cell_state_in_index; - param.activation = node.param().activation; - param.cell_clip = cell_threshold; - param.projection_clip = projection_threshold; - assert(param.cell_clip >= 0.f && param.projection_clip >= 0.f); - - param.has_cifg_param = has_cifg_param; - param.has_peephole_param = has_peephole_param; - param.has_projection_param = has_projection_param; - param.has_projection_bias = has_projection_bias; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto scratch_buffer_alloc = tensors->at(param.scratch_buffer_index).get(); - auto output_state_out_alloc = tensors->at(param.output_state_out_index).get(); - auto cell_state_out_alloc = tensors->at(param.cell_state_out_index).get(); - auto output_alloc = tensors->at(param.output_index).get(); - - auto input_alloc = tensors->at(param.input_index).get(); - ; - auto input_to_forget_weights_alloc = tensors->at(param.input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = tensors->at(param.input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = tensors->at(param.input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - tensors->at(param.recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = tensors->at(param.recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - tensors->at(param.recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = tensors->at(param.forget_gate_bias_index).get(); - auto cell_bias_alloc = tensors->at(param.cell_bias_index).get(); - auto output_gate_bias_alloc = tensors->at(param.output_gate_bias_index).get(); - auto output_state_in_alloc = tensors->at(param.output_state_in_index).get(); - auto cell_state_in_alloc = tensors->at(param.cell_state_in_index).get(); - - auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(param.activation); - ; - auto cell_clip = param.cell_clip; - auto proj_clip = param.projection_clip; - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; - if (param.has_cifg_param) - { - auto input_to_input_weights_alloc = - tensors->at(param.input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - tensors->at(param.recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - param.has_peephole_param ? 
tensors->at(param.cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = tensors->at(param.input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (param.has_peephole_param) - { - auto cell_to_forget_weights_alloc = - tensors->at(param.cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - tensors->at(param.cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (param.has_projection_param) - { - auto projection_weights_alloc = tensors->at(param.projection_weights_index).get(); // optional - auto projection_bias_handle = param.has_projection_bias - ? tensors->at(param.projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - l->configure(input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), - recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), - output_state_in_alloc->handle(), cell_state_in_alloc->handle(), - scratch_buffer_alloc->handle(), output_state_out_alloc->handle(), - cell_state_out_alloc->handle(), output_alloc->handle(), lstm_params, act_info, - cell_clip, proj_clip); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ReduceMaxNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::ReduceMaxNode::Input::INPUT)}; - const auto axis_index{node.param().axis_index}; - - auto input_shape = _ctx.at(input_index).shape(); - auto axis_shape = _ctx.at(axis_index).shape(); - - std::vector axis; - { - const auto ifm_rank = input_shape.rank(); - switch (axis_shape.rank()) - { - case 0: // scalar - { - int32_t axis_value = _ctx.at(axis_index).asScalar(); - if (axis_value < 0) - { - axis_value += ifm_rank; - } - axis.emplace_back( - ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); - break; - } - case 1: // vector - { - const auto axis_base = _ctx.at(axis_index).data().base(); - const int axis_size = axis_shape.num_elements(); - - // If axis's data does not exist as constant values and can be gotten as input data, we have - // to find a way to infer output shape when sinking output. 
- assert(axis_base != nullptr); - for (int32_t n = 0; n < axis_size; ++n) - { - int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); - if (axis_value < 0) - { - axis_value += ifm_rank; - } - axis.emplace_back( - ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value()); - } - break; - } - default: - throw std::runtime_error("Not supported"); - break; - } - } - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - - std::vector<uint32_t> axis_index; - uint32_t input_rank; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - param.axis_index = axis; - param.input_rank = input_shape.rank(); - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.output_index).get(); - auto ifm_alloc = tensors->at(param.input_index).get(); - std::set<uint32_t> axes; - // TODO Support NCHW frontend - // TODO Change the layout of frontend and backend to be the same - auto acl_layout = ifm_alloc->handle()->info()->data_layout(); - // CWHN -> WHCN - uint32_t permutation[4] = {2, 0, 1, 3}; - for (size_t i = 0; i < param.axis_index.size(); ++i) - { - if (acl_layout == ::arm_compute::DataLayout::NCHW && param.input_rank == 4) - { - axes.insert(permutation[param.axis_index[i]]); - } - else - { - axes.insert(param.axis_index[i]); - } - } - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, arm_compute::ReduceOperation::MAX); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ComparisonNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)}; - const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input0_index; - model::OperandIndex input1_index; - - model::operation::ComparisonNode::ComparisonType comparison_type; - }; - - Param param; - - param.output_index = output_index; - param.input0_index = input0_index; - param.input1_index = input1_index; - - param.comparison_type = node.param().comparison_type; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input0_alloc = tensors->at(param.input0_index).get(); - auto input1_alloc = tensors->at(param.input1_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLComparison>(); - - l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), - (arm_compute::ComparisonOperation)param.comparison_type); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::RSQRTNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - -
Param param; - - param.ofm_index = output_index; - param.ifm_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); - - builder.append(asAclFunction(std::move(fn))); - }); -} - -void StageGenerator::visit(const model::operation::ReLUNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::ReLUNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ResizeBilinearNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - - const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinearNode::Input::INPUT)}; - const auto height_index{node.param().height_index}; - const auto width_index{node.param().width_index}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - int32_t new_height; - int32_t new_width; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.new_height = _ctx.at(height_index).asScalar<int32_t>(); - param.new_width = _ctx.at(width_index).asScalar<int32_t>(); - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLScale>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), - ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, - ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ReLU1Node &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::ReLU1Node::Input::INPUT)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - - std::unique_ptr<::arm_compute::IFunction> fn; - -
auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ReLU6Node &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::ReLU6Node::Input::INPUT)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::RNNNode &node) -{ - const auto output_index{node.getOutputs().at(model::operation::RNNNode::Output::OUTPUT)}; - const auto hidden_state_out_index{ - node.getOutputs().at(model::operation::RNNNode::Output::HIDDEN_STATE_OUT)}; - - const auto input_index{node.getInputs().at(model::operation::RNNNode::Input::INPUT)}; - const auto weights_index{node.getInputs().at(model::operation::RNNNode::Input::WEIGHTS)}; - const auto recurrent_weights_index{ - node.getInputs().at(model::operation::RNNNode::Input::RECURRENT_WEIGHTS)}; - const auto bias_index{node.getInputs().at(model::operation::RNNNode::Input::BIAS)}; - const auto hidden_state_in_index{ - node.getInputs().at(model::operation::RNNNode::Input::HIDDEN_STATE_IN)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex hidden_state_out_index; - - model::OperandIndex input_index; - model::OperandIndex weights_index; - model::OperandIndex recurrent_weights_index; - model::OperandIndex bias_index; - model::OperandIndex hidden_state_in_index; - model::Activation activation; - }; - - Param param; - - param.output_index = output_index; - param.hidden_state_out_index = hidden_state_out_index; - - param.input_index = input_index; - param.weights_index = weights_index; - param.recurrent_weights_index = recurrent_weights_index; - param.bias_index = bias_index; - param.hidden_state_in_index = hidden_state_in_index; - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto hidden_state_out_alloc = tensors->at(param.hidden_state_out_index).get(); - - auto input_alloc = tensors->at(param.input_index).get(); - auto weights_alloc = tensors->at(param.weights_index).get(); - auto recurrent_weights_alloc = tensors->at(param.recurrent_weights_index).get(); - auto bias_alloc = tensors->at(param.bias_index).get(); - auto hidden_state_in_alloc = tensors->at(param.hidden_state_in_index).get(); - auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(param.activation); - - auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); - 
copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); - builder.append(asAclFunction(std::move(copy_layer))); - - std::unique_ptr<::arm_compute::IFunction> fn; - auto rnn_layer = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>(); - rnn_layer->configure(input_alloc->handle(), weights_alloc->handle(), - recurrent_weights_alloc->handle(), bias_alloc->handle(), - hidden_state_out_alloc->handle(), output_alloc->handle(), act_info); - fn = std::move(rnn_layer); - builder.append(asAclFunction(std::move(fn))); - }); -} - -void StageGenerator::visit(const model::operation::FloorNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::FloorNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLFloor>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::SpaceToDepthNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; - const auto block_size_index{node.param().block_size_index}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - int32_t block_size; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.block_size = _ctx.at(block_size_index).asScalar(); - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), param.block_size); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::L2Pool2DNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::L2Pool2DNode::Input::INPUT)}; - - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - uint32_t kw; - uint32_t kh; - - model::ExplicitPadding padding; - model::Stride stride; - model::Activation activation; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - param.kw = node.param().kw; - param.kh = node.param().kh; - param.stride = node.param().stride; - param.padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, - param.stride, param.kw, param.kh); - param.activation = node.param().activation; - - auto tensors = _tensor_builder; - - 
returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{param.kw, param.kh}, - ::neurun::backend::acl_common::asPadStrideInfo(param.padding, param.stride)}; - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - - ActivationBuilder{builder}.append(param.activation, ofm_alloc->handle()); - }); -} - -void StageGenerator::visit(const model::operation::EmbeddingLookupNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto lookups_index{ - node.getInputs().at(model::operation::EmbeddingLookupNode::Input::LOOKUPS)}; - const auto values_index{ - node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex lookups_index; - model::OperandIndex values_index; - }; - - Param param; - - param.output_index = output_index; - param.lookups_index = lookups_index; - param.values_index = values_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto lookups_alloc = tensors->at(param.lookups_index).get(); - auto values_alloc = tensors->at(param.values_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>(); - - l->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::L2NormalizationNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::L2NormalizationNode::Input::INPUT)}; - - // {CL|Neon}L2Normalization performs the reduction only along dimension 0 - // L2 Normalization always performs the reduction along the depth axis - // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by - // choosing normalization parameters as below - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - - int32_t radius; - float alpha; - float beta; - float bias; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - param.radius = 2 * _ctx.at(ifm_index).shape().dim(3) + 1; // normSize = depth * 2 + 1 - param.alpha = 1.0f; // In the implementation to make alpha_ become 1 - param.beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) - param.bias = 0.0f; // Don't offset the reduction. 
- - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - const auto norm_info = - ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, param.radius, - param.alpha, param.beta, param.bias, false); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::HashtableLookupNode &node) -{ - const auto output_index{ - node.getOutputs().at(model::operation::HashtableLookupNode::Output::OUTPUT)}; - const auto hits_index{node.getOutputs().at(model::operation::HashtableLookupNode::Output::HITS)}; - - const auto lookups_index{ - node.getInputs().at(model::operation::HashtableLookupNode::Input::LOOKUPS)}; - const auto keys_index{node.getInputs().at(model::operation::HashtableLookupNode::Input::KEYS)}; - const auto values_index{ - node.getInputs().at(model::operation::HashtableLookupNode::Input::VALUES)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex hits_index; - - model::OperandIndex lookups_index; - model::OperandIndex keys_index; - model::OperandIndex values_index; - }; - - Param param; - - param.output_index = output_index; - param.hits_index = hits_index; - - param.lookups_index = lookups_index; - param.keys_index = keys_index; - param.values_index = values_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto hits_alloc = tensors->at(param.hits_index).get(); - - auto lookups_alloc = tensors->at(param.lookups_index).get(); - auto keys_alloc = tensors->at(param.keys_index).get(); - auto values_alloc = tensors->at(param.values_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>(); - - l->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::PReLUNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; - const auto alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; - - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - model::OperandIndex alpha_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.alpha_index = alpha_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - auto alpha_alloc = tensors->at(param.alpha_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>(); - - l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - 
auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::TransposeConvNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto output_shape_index{ - node.getInputs().at(model::operation::TransposeConvNode::Input::OUTPUT_SHAPE)}; - const auto ker_index{node.getInputs().at(model::operation::TransposeConvNode::Input::KERNEL)}; - const auto ifm_index{node.getInputs().at(model::operation::TransposeConvNode::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); - const auto ker_shape = _ctx.at(ker_index).shape().asFeature(); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - model::OperandIndex ker_index; - - model::ExplicitPadding padding; - model::Stride stride; - uint32_t invalid_horizontal; - uint32_t invalid_vertical; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.ker_index = ker_index; - - param.stride = node.param().stride; - - assert((node.param().padding.type == model::PaddingType::SAME) || - (node.param().padding.type == model::PaddingType::VALID)); - param.padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, - param.stride, ker_shape.W, ker_shape.H); - if (node.param().padding.type == model::PaddingType::VALID) - { - param.invalid_horizontal = - ofm_shape.W - (1 + (ifm_shape.W - 1) * param.stride.horizontal) - (ker_shape.W - 1); - param.invalid_vertical = - ofm_shape.H - (1 + (ifm_shape.H - 1) * param.stride.vertical) - (ker_shape.H - 1); - } - else - { - param.invalid_horizontal = 0; - param.invalid_vertical = 0; - } - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - auto ker_alloc = tensors->at(param.ker_index).get(); - - const auto tconv_info = acl_common::asPadStrideInfo(param.padding, param.stride); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>(); - - l->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - param.invalid_vertical, param.invalid_horizontal); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::SQRTNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::SQRTNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - l->configure(input_alloc->handle(), output_alloc->handle(), act_info); - - fn = std::move(l); - - auto acl_fn = 
asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::LogicalOrNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; - const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input0_index; - model::OperandIndex input1_index; - }; - - Param param; - - param.output_index = output_index; - param.input0_index = input0_index; - param.input1_index = input1_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input0_alloc = tensors->at(param.input0_index).get(); - auto input1_alloc = tensors->at(param.input1_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>(); - - l->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::LogicalNotNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::LogicalNotNode::Input::INPUT)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>(); - - l->configure(input_alloc->handle(), output_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::SquaredDifferenceNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; - const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex lhs_index; - model::OperandIndex rhs_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.lhs_index = lhs_index; - param.rhs_index = rhs_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto lhs_alloc = tensors->at(param.lhs_index).get(); - auto rhs_alloc = tensors->at(param.rhs_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); - - l->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::TopKV2Node &node) 
-{ - const auto outputValues_index{ - node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_VALUES)}; - const auto outputIndices_index{ - node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_INDICES)}; - - const auto inputData_index{node.getInputs().at(model::operation::TopKV2Node::Input::INPUT)}; - const auto k_index{node.param().k_index}; - - // Currently, we only support the vector input. - assert(_ctx.at(inputData_index).shape().rank() == 1 || - _ctx.at(inputData_index).shape().rank() == 2); - - const int32_t k = _ctx.at(k_index).asScalar(); - - // Construct operation parameters - struct Param - { - model::OperandIndex outputValues_index; - model::OperandIndex outputIndices_index; - - model::OperandIndex inputData_index; - int32_t k; - }; - - Param param; - - param.outputValues_index = outputValues_index; - param.outputIndices_index = outputIndices_index; - param.inputData_index = inputData_index; - param.k = k; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto values_alloc = tensors->at(param.outputValues_index).get(); - auto indices_alloc = tensors->at(param.outputIndices_index).get(); - auto input_alloc = tensors->at(param.inputData_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>(); - - l->configure(input_alloc->handle(), param.k, values_alloc->handle(), indices_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::GatherNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - - const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; - const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; - - const auto axis_index{node.param().axis_index}; - - const auto ifm_shape = _ctx.at(ifm_index).shape(); - - const int32_t axis_value = static_cast(_ctx.at(axis_index).asScalar()); - const int axis = - ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value(); - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - model::OperandIndex indices_index; - - int32_t axis; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - param.indices_index = indices_index; - - param.axis = axis; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - auto indices_alloc = tensors->at(param.indices_index).get(); - auto acl_layout = ofm_alloc->handle()->info()->data_layout(); - UNUSED_RELEASE(acl_layout); - - // NOTE The frontend layout and backend layout must be the same for this operation. - // If not the same, we have to add a stage(?) to perform permutation of output tensor. It - // is not not efficient even if it works well. If so, it would be better to set the - // layout of these backend tensors to the same layout. - // There is also one thing we have to think about. This operation depends on the layout of - // a model. For example, if a model in NHWC has this operation as output rank == 4, indices - // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W - // and C are not sequential in NCHW. 
So the backend in NCHW cannot handle this case. - // TODO Remove this workaround - // It is a workaround how to set the layout of these backend tensors to the layout of the - // frontend when creating them - // TODO Supports front-end in NCHW - // TODO Change the layout of frontend and backend to be the same - // assert(::arm_compute::DataLayout::NHWC == acl_layout); - assert(acl_layout == ifm_alloc->handle()->info()->data_layout()); - assert(acl_layout == indices_alloc->handle()->info()->data_layout()); - - std::unique_ptr<::arm_compute::IFunction> fn; - // TODO Change to CLGather - auto l = nnfw::cpp14::make_unique<::arm_compute::misc::GenericGather>(); - - l->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), param.axis); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::NegNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::NegNode::Input::INPUT)}; - - // Construct operation parameters - struct Param - { - model::OperandIndex ofm_index; - model::OperandIndex ifm_index; - }; - - Param param; - - param.ofm_index = ofm_index; - param.ifm_index = ifm_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto ofm_alloc = tensors->at(param.ofm_index).get(); - auto ifm_alloc = tensors->at(param.ifm_index).get(); - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLNeg>(); - - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::AbsNode &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(model::operation::AbsNode::Input::INPUT)}; - - struct Param - { - model::OperandIndex output_index; - model::OperandIndex input_index; - }; - - Param param; - - param.output_index = output_index; - param.input_index = input_index; - - auto tensors = _tensor_builder; - - returnStage([tensors, param](IExecutionBuilder &builder) { - auto output_alloc = tensors->at(param.output_index).get(); - auto input_alloc = tensors->at(param.input_index).get(); - - const ::arm_compute::ActivationLayerInfo act_info{ - ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; - - std::unique_ptr<::arm_compute::IFunction> fn; - - auto l = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); - - l->configure(input_alloc->handle(), output_alloc->handle(), act_info); - - fn = std::move(l); - - auto acl_fn = asAclFunction(std::move(fn)); - - builder.append(std::move(acl_fn)); - }); -} - -void StageGenerator::visit(const model::operation::ArgMaxNode &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; - const auto axis_index{node.param().axis_index}; - - auto ifm_shape = _ctx.at(ifm_index).shape(); - auto ofm_shape = _ctx.at(ofm_index).shape(); - auto axis_shape = _ctx.at(axis_index).shape(); - - assert(_ctx.at(axis_index).isConstant()); - // Axis dimension is always 1. 
-  assert(axis_shape.rank() == 1);
-  assert((ifm_shape.rank() - 1) == ofm_shape.rank());
-
-  std::vector<uint32_t> l_axis;
-  const int axis_size = axis_shape.num_elements();
-  auto axis_base = _ctx.at(axis_index).data().base();
-  // TODO Should support axis size > 1.
-  assert(axis_size == 1);
-  // axis is tensor with 1 dimension - always a vector.
-  assert(axis_base != nullptr);
-
-  for (int32_t n = 0; n < axis_size; ++n)
-  {
-    int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
-    if (axis_value < 0)
-    {
-      axis_value += ifm_shape.rank();
-    }
-    l_axis.push_back(acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value());
-  }
-
-  // Construct operation parameters
-  struct Param
-  {
-    model::OperandIndex ofm_index;
-    model::OperandIndex ifm_index;
-    std::vector<uint32_t> axis;
-    uint32_t ifm_rank;
-  };
-
-  Param param;
-
-  param.ofm_index = ofm_index;
-  param.ifm_index = ifm_index;
-  param.axis = l_axis;
-  param.ifm_rank = ifm_shape.rank();
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto ofm_alloc = tensors->at(param.ofm_index).get();
-    auto ifm_alloc = tensors->at(param.ifm_index).get();
-    auto axis = param.axis;
-    // TODO Support NCHW frontend
-    // TODO Change the layout of frontend and backend to be the same
-    auto acl_layout = ifm_alloc->handle()->info()->data_layout();
-    if (acl_layout == ::arm_compute::DataLayout::NCHW && param.ifm_rank == 4)
-    {
-      // CWHN -> WHCN
-      uint32_t permutation[4] = {2, 0, 1, 3};
-      for (size_t i = 0; i < axis.size(); ++i)
-      {
-        axis[i] = permutation[axis[i]];
-      }
-    }
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>();
-
-    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), axis, ::arm_compute::ArgOperation::MAX);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::DequantizeNode &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(model::operation::DequantizeNode::Input::INPUT)};
-
-  // Construct operation parameters
-  struct Param
-  {
-    model::OperandIndex output_index;
-    model::OperandIndex input_index;
-  };
-
-  Param param;
-
-  param.output_index = output_index;
-  param.input_index = input_index;
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto output_alloc = tensors->at(param.output_index).get();
-    auto input_alloc = tensors->at(param.input_index).get();
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
-
-    l->configure(input_alloc->handle(), output_alloc->handle());
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::MeanNode &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  const auto ifm_index{node.getInputs().at(model::operation::MeanNode::Input::INPUT)};
-
-  const auto axis_index{node.param().axis_index};
-  const auto keep_dims_index{node.param().keep_dims_index};
-
-  const int keep_dims = _ctx.at(keep_dims_index).asScalar<int32_t>();
-
-  const auto ifm_shape = _ctx.at(ifm_index).shape();
-
-  std::vector<uint32_t> axis;
-  {
-    const auto ifm_rank = ifm_shape.rank();
-    const auto axis_shape = _ctx.at(axis_index).shape();
-    switch (axis_shape.rank())
-    {
-      case 0: // scalar
-      {
-        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
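-        // A negative axis counts from the last dimension, so normalize it with the input rank
-        // before converting it to an ACL axis.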
-        if (axis_value < 0)
-        {
-          axis_value += ifm_rank;
-        }
-        axis.emplace_back(
-            ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
-        break;
-      }
-      case 1: // vector
-      {
-        const auto axis_base = _ctx.at(axis_index).data().base();
-        const int axis_size = axis_shape.num_elements();
-
-        // If axis's data does not exist as constant values and can be gotten as input data, we have
-        // to find a way to infer output shape when sinking output.
-        assert(axis_base != nullptr);
-        for (int32_t n = 0; n < axis_size; ++n)
-        {
-          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
-          if (axis_value < 0)
-          {
-            axis_value += ifm_rank;
-          }
-          axis.emplace_back(
-              ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
-        }
-        break;
-      }
-      default:
-        throw std::runtime_error("Not supported");
-        break;
-    }
-  }
-
-  struct Param
-  {
-    model::OperandIndex ofm_index;
-    model::OperandIndex ifm_index;
-    bool keep_dims;
-    std::vector<uint32_t> axis;
-    uint32_t ifm_rank;
-  };
-
-  Param param;
-
-  param.ofm_index = ofm_index;
-  param.ifm_index = ifm_index;
-  param.keep_dims = keep_dims > 0 ? true : false;
-  param.axis = axis;
-  param.ifm_rank = ifm_shape.rank();
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto ofm_alloc = tensors->at(param.ofm_index).get();
-    auto ifm_alloc = tensors->at(param.ifm_index).get();
-    std::set<uint32_t> axis;
-    // TODO Support NCHW frontend
-    // TODO Change the layout of frontend and backend to be the same
-    auto acl_layout = ifm_alloc->handle()->info()->data_layout();
-    // CWHN -> WHCN
-    uint32_t permutation[4] = {2, 0, 1, 3};
-    for (size_t i = 0; i < param.axis.size(); ++i)
-    {
-      if (acl_layout == ::arm_compute::DataLayout::NCHW && param.ifm_rank == 4)
-      {
-        axis.insert(permutation[param.axis[i]]);
-      }
-      else
-      {
-        axis.insert(param.axis[i]);
-      }
-    }
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    // NOTE CLReduceMean has a bug that does not support NHWC layout
-    //      CLReduceMean intermediate tensors are always NCHW layout
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
-
-    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), axis,
-                 ::arm_compute::ReduceOperation::MEAN);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::LocalResponseNormalizationNode &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  const auto ifm_index{
-      node.getInputs().at(model::operation::LocalResponseNormalizationNode::Input::INPUT)};
-  const auto radius_index{node.param().radius_index};
-  const auto bias_index{node.param().bias_index};
-  const auto alpha_index{node.param().alpha_index};
-  const auto beta_index{node.param().beta_index};
-
-  // Construct operation parameters
-  struct Param
-  {
-    model::OperandIndex ofm_index;
-    model::OperandIndex ifm_index;
-
-    int32_t radius;
-    float bias;
-    float alpha;
-    float beta;
-  };
-
-  Param param;
-
-  param.ofm_index = ofm_index;
-  param.ifm_index = ifm_index;
-
-  param.radius = _ctx.at(radius_index).asScalar<int32_t>();
-  param.alpha = _ctx.at(alpha_index).asScalar<float>();
-  param.beta = _ctx.at(beta_index).asScalar<float>();
-  param.bias = _ctx.at(bias_index).asScalar<float>();
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto ofm_alloc = tensors->at(param.ofm_index).get();
-    auto ifm_alloc = tensors->at(param.ifm_index).get();
-
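-    // ACL's NormalizationLayerInfo takes the full normalization window size rather than the
-    // one-sided radius used by the frontend, hence radius * 2 + 1 below.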
-    const auto norm_info =
-        ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
-                                              param.radius * 2 + 1, param.alpha,
-                                              param.beta, param.bias, false);
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
-
-    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::DepthToSpaceNode &node)
-{
-  const auto output_index{node.getOutputs().at(0)};
-  const auto input_index{node.getInputs().at(model::operation::DepthToSpaceNode::Input::INPUT)};
-  const auto block_size_index{node.param().block_size_index};
-
-  int32_t block_size = _ctx.at(block_size_index).asScalar<int32_t>();
-  assert(block_size > 0);
-
-  // Construct operation parameters
-  struct Param
-  {
-    model::OperandIndex output_index;
-    model::OperandIndex input_index;
-    int32_t block_size;
-  };
-
-  Param param;
-
-  param.output_index = output_index;
-  param.input_index = input_index;
-  param.block_size = block_size;
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto output_alloc = tensors->at(param.output_index).get();
-    auto input_alloc = tensors->at(param.input_index).get();
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>();
-
-    l->configure(input_alloc->handle(), output_alloc->handle(), param.block_size);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::ReduceMinNode &node)
-{
-  const auto ofm_index{node.getOutputs().at(0)};
-  const auto ifm_index{node.getInputs().at(model::operation::ReduceMinNode::Input::INPUT)};
-  const auto axis_index{node.param().axis_index};
-
-  auto ifm_shape = _ctx.at(ifm_index).shape();
-  auto ofm_shape = _ctx.at(ofm_index).shape();
-  auto axis_shape = _ctx.at(axis_index).shape();
-
-  std::vector<uint32_t> axis;
-  {
-    const auto ifm_rank = ifm_shape.rank();
-    switch (axis_shape.rank())
-    {
-      case 0: // scalar
-      {
-        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
-        if (axis_value < 0)
-        {
-          axis_value += ifm_rank;
-        }
-        axis.emplace_back(
-            ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
-        break;
-      }
-      case 1: // vector
-      {
-        const auto axis_base = _ctx.at(axis_index).data().base();
-        const int axis_size = axis_shape.num_elements();
-
-        // If axis's data does not exist as constant values and can be gotten as input data, we have
-        // to find a way to infer output shape when sinking output.
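-        // Each element of the constant axis vector is normalized (negative values wrap around the
-        // input rank) and converted to the corresponding ACL axis, as in the scalar case above.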
-        assert(axis_base != nullptr);
-        for (int32_t n = 0; n < axis_size; ++n)
-        {
-          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
-          if (axis_value < 0)
-          {
-            axis_value += ifm_rank;
-          }
-          axis.emplace_back(
-              ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value());
-        }
-        break;
-      }
-      default:
-        throw std::runtime_error("Not supported");
-        break;
-    }
-  }
-
-  // Construct operation parameters
-  struct Param
-  {
-    model::OperandIndex ofm_index;
-    model::OperandIndex ifm_index;
-    std::vector<uint32_t> axis;
-    uint32_t ifm_rank;
-  };
-
-  Param param;
-
-  param.ofm_index = ofm_index;
-  param.ifm_index = ifm_index;
-  param.axis = axis;
-  param.ifm_rank = ifm_shape.rank();
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto ofm_alloc = tensors->at(param.ofm_index).get();
-    auto ifm_alloc = tensors->at(param.ifm_index).get();
-    std::set<uint32_t> axis;
-    // TODO Support NCHW frontend
-    // TODO Change the layout of frontend and backend to be the same
-    auto acl_layout = ifm_alloc->handle()->info()->data_layout();
-    // CWHN -> WHCN
-    uint32_t permutation[4] = {2, 0, 1, 3};
-    for (size_t i = 0; i < param.axis.size(); ++i)
-    {
-      if (acl_layout == ::arm_compute::DataLayout::NCHW && param.ifm_rank == 4)
-      {
-        axis.insert(permutation[param.axis[i]]);
-      }
-      else
-      {
-        axis.insert(param.axis[i]);
-      }
-    }
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
-
-    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), axis,
-                 ::arm_compute::ReduceOperation::MIN);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::SplitNode &node)
-{
-  const auto input_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)};
-  const auto axis_index{node.param().axis_index};
-  const auto num_of_splits_index{node.param().num_of_splits_index};
-
-  assert(_ctx.at(num_of_splits_index).asScalar<uint32_t>() == node.getOutputs().size());
-
-  const auto ifm_rank = _ctx.at(input_index).shape().rank();
-
-  struct Param
-  {
-    model::OperandIndex ifm_index;
-    std::vector<model::OperandIndex> output_indexes;
-    int32_t axis;
-    uint32_t ifm_rank;
-  };
-
-  Param param;
-  param.ifm_index = input_index;
-  param.axis = _ctx.at(axis_index).asScalar<int32_t>();
-  if (param.axis < 0)
-    param.axis += ifm_rank;
-  param.axis = acl_common::ToARMComputeAxis(ifm_rank, param.axis).value();
-  param.ifm_rank = ifm_rank;
-
-  for (const auto &e : node.getOutputs())
-    param.output_indexes.emplace_back(e);
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto ifm_alloc = tensors->at(param.ifm_index).get();
-    std::vector<arm_compute::ICLTensor *> output_allocs;
-    for (auto ofm_ind : param.output_indexes)
-    {
-      output_allocs.emplace_back(tensors->at(ofm_ind).get()->handle());
-    }
-    auto axis = param.axis;
-    auto acl_layout = ifm_alloc->handle()->info()->data_layout();
-
-    if (acl_layout == ::arm_compute::DataLayout::NCHW && param.ifm_rank == 4)
-    {
-      // CWHN -> WHCN
-      uint32_t permutation[4] = {2, 0, 1, 3};
-      axis = permutation[axis];
-    }
-
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    // TODO Support NCHW frontend
-    // TODO Change the layout of frontend and backend to be the same
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLSplit>();
-
-    l->configure(ifm_alloc->handle(), output_allocs, axis);
-
-    fn = std::move(l);
-
-    auto acl_fn = asAclFunction(std::move(fn));
-
-    builder.append(std::move(acl_fn));
-  });
-}
-
-void StageGenerator::visit(const model::operation::UnpackNode &node)
-{
-  const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)};
-  const auto axis{node.param().axis};
-
-  const auto input_rank = _ctx.at(input_index).shape().rank();
-
-  struct Param
-  {
-    model::OperandIndex input_index;
-    std::vector<model::OperandIndex> output_indexes;
-    int32_t axis;
-  };
-
-  Param param;
-  param.input_index = input_index;
-  param.axis = axis;
-  if (param.axis < 0)
-    param.axis += input_rank;
-  param.axis = acl_common::ToARMComputeAxis(input_rank, param.axis).value();
-
-  for (const auto &output_index : node.getOutputs())
-    param.output_indexes.emplace_back(output_index);
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto input = tensors->at(param.input_index).get()->handle();
-    std::vector<arm_compute::ICLTensor *> outputs;
-    for (const auto output_index : param.output_indexes)
-    {
-      outputs.emplace_back(tensors->at(output_index)->handle());
-    }
-
-    int axis = param.axis;
-    if (input->info()->num_dimensions() == 4 &&
-        input->info()->data_layout() == ::arm_compute::DataLayout::NCHW)
-    {
-      // CWHN -> WHCN
-      const int permutation[4] = {2, 0, 1, 3};
-      axis = permutation[axis];
-    }
-
-    auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>();
-
-    fn->configure(input, outputs, axis);
-
-    builder.append(asAclFunction(std::move(fn)));
-  });
-}
-
-void StageGenerator::visit(const model::operation::PadNode &node)
-{
-  const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)};
-  const auto pad_index{node.getInputs().at(model::operation::PadNode::Input::PAD)};
-  const auto output_index{node.getOutputs().at(0)};
-
-  struct Param
-  {
-    model::OperandIndex input_index;
-    model::OperandIndex output_index;
-    ::arm_compute::PixelValue pixel_value;
-    ::arm_compute::PaddingList padding_list;
-  };
-
-  assert(_ctx.at(pad_index).isConstant());
-
-  Param param;
-  param.input_index = input_index;
-  param.output_index = output_index;
-
-  auto rank = _ctx.at(pad_index).shape().dim(0);
-  auto pad_base = _ctx.at(pad_index).data().base();
-  param.padding_list.resize(rank);
-  for (int32_t n = 0; n < rank; ++n)
-  {
-    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
-    auto axis = acl_common::ToARMComputeAxis(rank, n).value();
-
-    param.padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
-  }
-
-  auto input_type = _ctx.at(input_index).typeInfo();
-  auto data_type = acl_common::asDataType(input_type.type());
-  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
-  param.pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
-
-  auto tensors = _tensor_builder;
-
-  returnStage([tensors, param](IExecutionBuilder &builder) {
-    auto input = tensors->at(param.input_index).get()->handle();
-    auto output = tensors->at(param.output_index).get()->handle();
-
-    auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>();
-    fn->configure(input, output, param.padding_list, param.pixel_value);
-
-    builder.append(asAclFunction(std::move(fn)));
-  });
-}
+void StageGenerator::visit(const model::operation::PadNode &) {}
 } // namespace acl_cl
 } // namespace backend
-- 
2.7.4