08d77c81320f6912e438f9c9768382d88688f83b
[platform/core/ml/nnfw.git] / runtime / neurun / backend / acl_cl / KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclFunction.h>
23 #include <Convert.h>
24 #include <Swizzle.h>
25
26 #include "kernel/ConcatLayer.h"
27 #include "model/Index.h"
28 #include "model/DataType.h"
29 #include "model/InternalType.h"
30 #include "compiler/IExecutionBuilder.h"
31 #include "exec/NopFunction.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "util/Padding.h"
35
36 using ::neurun::compiler::IExecutionBuilder;
37
38 namespace neurun
39 {
40 namespace backend
41 {
42 namespace acl_cl
43 {
44
45 using ::neurun::backend::acl_common::asAclFunction;
46
47 //
48 // ActivationBuilder
49 //
50 class ActivationBuilder
51 {
52 public:
53   explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder)
54   {
55     // DO NOTHING
56   }
57
58 private:
59   void appendReLU(::arm_compute::ICLTensor *ifm_alloc);
60   void appendReLU1(::arm_compute::ICLTensor *ifm_alloc);
61   void appendReLU6(::arm_compute::ICLTensor *ifm_alloc);
62
63 public:
64   void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc);
65
66 private:
67   IExecutionBuilder &_builder;
68 };
69
70 void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc)
71 {
72   const ::arm_compute::ActivationLayerInfo act_info{
73       ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
74
75   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
76
77   fn->configure(ifm_alloc, nullptr, act_info);
78
79   auto acl_fn = asAclFunction(std::move(fn));
80
81   _builder.append(std::move(acl_fn));
82 }
83
84 void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc)
85 {
86   const ::arm_compute::ActivationLayerInfo act_info{
87       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
88
89   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
90
91   fn->configure(ifm_alloc, nullptr, act_info);
92
93   auto acl_fn = asAclFunction(std::move(fn));
94
95   _builder.append(std::move(acl_fn));
96 }
97
98 void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc)
99 {
100   const ::arm_compute::ActivationLayerInfo act_info{
101       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
102
103   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
104
105   fn->configure(ifm_alloc, nullptr, act_info);
106
107   auto acl_fn = asAclFunction(std::move(fn));
108
109   _builder.append(std::move(acl_fn));
110 }
111
112 void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc)
113 {
114   switch (code)
115   {
116     case model::Activation::NONE:
117     {
118       // DO NOTHING
119       break;
120     }
121     case model::Activation::RELU:
122     {
123       appendReLU(ifm_alloc);
124       break;
125     }
126     case model::Activation::RELU1:
127     {
128       appendReLU1(ifm_alloc);
129       break;
130     }
131     case model::Activation::RELU6:
132     {
133       appendReLU6(ifm_alloc);
134       break;
135     }
136     default:
137     {
138       throw std::runtime_error("Not supported, yet");
139     }
140   }
141 }
142
143 //
144 // KernelGenerator
145 //
146 KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx,
147                                  const std::shared_ptr<TensorBuilder> &tensor_builder)
148     : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN)
149 {
150   // DO NOTHING
151 }
152
153 void KernelGenerator::visit(const model::Subgraph &subgraph)
154 {
155   _current_subg_layout = subgraph.getLayout();
156   for (const auto &e : subgraph.operations())
157   {
158     const auto &node = *(e.node);
159     _tensor_builder->preVisit(node);
160     node.accept(*this);
161     _tensor_builder->postVisit(node);
162   }
163 }
164
165 void KernelGenerator::visit(const model::operation::BatchToSpaceND &node)
166 {
167   const auto ofm_index{node.getOutputs().at(0)};
168   const auto ifm_index{node.getInputs().at(model::operation::BatchToSpaceND::Input::INPUT)};
169   const auto block_size_index{
170       node.getInputs().at(model::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
171
172   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
173   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
174   auto block_size_alloc = _tensor_builder->at(block_size_index).get();
175
176   assert(_ctx.at(block_size_index).isConstant());
177
178   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBatchToSpaceLayer>();
179
180   fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
181
182   auto acl_fn = asAclFunction(std::move(fn));
183
184   _execution_builder->append(std::move(acl_fn));
185 }
186
187 void KernelGenerator::visit(const model::operation::Cast &node)
188 {
189   const auto ofm_index{node.getOutputs().at(0)};
190   const auto ifm_index{node.getInputs().at(model::operation::Cast::Input::INPUT)};
191
192   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
193   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
194
195   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
196
197   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
198
199   auto acl_fn = asAclFunction(std::move(fn));
200
201   _execution_builder->append(std::move(acl_fn));
202 }
203
204 void KernelGenerator::visit(const model::operation::Conv2D &node)
205 {
206   using model::operation::Conv2D;
207
208   const auto ofm_index{node.getOutputs().at(0)};
209   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
210   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
211   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
212
213   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
214   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
215   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
216   const auto &ker_shape = _ctx.at(ker_index).shape();
217   const auto ker_height = ker_shape.dim(1);
218   const auto ker_width = ker_shape.dim(2);
219
220   const auto stride = node.param().stride;
221   const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
222                                                       stride, ker_width, ker_height);
223   const auto activation = node.param().activation;
224
225   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
226   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
227   auto ker_alloc = _tensor_builder->at(ker_index).get();
228   auto bias_alloc = _tensor_builder->at(bias_index).get();
229
230   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
231   const auto act_info = acl_common::asActivationLayerInfo(activation);
232
233   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>(
234       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
235
236   fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
237                 conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
238
239   _execution_builder->append(asAclFunction(std::move(fn)));
240 }
241
242 void KernelGenerator::visit(const model::operation::DepthwiseConv2D &node)
243 {
244   using model::operation::DepthwiseConv2D;
245
246   const auto ofm_index{node.getOutputs().at(0)};
247   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
248   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
249   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
250
251   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
252   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
253   // Kernel format is [1, kernel_height, kernel_width, depth_out].
254   const auto &ker_shape = _ctx.at(ker_index).shape();
255   const auto ker_height = ker_shape.dim(1);
256   const auto ker_width = ker_shape.dim(2);
257
258   const auto stride = node.param().stride;
259   const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
260                                                       stride, ker_width, ker_height);
261   const auto multiplier = node.param().multiplier;
262   const auto activation = node.param().activation;
263
264   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
265   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
266   auto ker_alloc = _tensor_builder->at(ker_index).get();
267   auto bias_alloc = _tensor_builder->at(bias_index).get();
268
269   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
270   const auto act_info = acl_common::asActivationLayerInfo(activation);
271
272   if (ker_height == 3 && ker_width == 3)
273   {
274     auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>(
275         _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
276
277     fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
278                   ofm_alloc->handle(), conv_info, multiplier, act_info);
279
280     _execution_builder->append(asAclFunction(std::move(fn)));
281   }
282   else
283   {
284     auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
285
286     fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
287                   ofm_alloc->handle(), conv_info, multiplier, act_info);
288
289     _execution_builder->append(asAclFunction(std::move(fn)));
290   }
291 }
292
293 void KernelGenerator::visit(const model::operation::MaxPool2D &node)
294 {
295   const auto ofm_index{node.getOutputs().at(0)};
296   const auto ifm_index{node.getInputs().at(model::operation::MaxPool2D::Input::INPUT)};
297
298   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
299   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
300
301   const auto kh = node.param().kh;
302   const auto kw = node.param().kw;
303   const auto stride = node.param().stride;
304   const auto padding =
305       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
306   const auto activation = node.param().activation;
307
308   VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
309   VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
310   VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
311   VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
312   VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
313   VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
314   VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
315   VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
316   VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
317   VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
318   VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
319   VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
320
321   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
322   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
323
324   ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
325                                        ::arm_compute::Size2D{kw, kh},
326                                        acl_common::asPadStrideInfo(padding, stride)};
327
328   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
329
330   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
331
332   auto acl_fn = asAclFunction(std::move(fn));
333
334   _execution_builder->append((std::move(acl_fn)));
335
336   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
337 }
338
339 void KernelGenerator::visit(const model::operation::AvgPool2D &node)
340 {
341   const auto ofm_index{node.getOutputs().at(0)};
342   const auto ifm_index{node.getInputs().at(model::operation::AvgPool2D::Input::INPUT)};
343
344   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
345   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
346
347   const auto kh = node.param().kh;
348   const auto kw = node.param().kw;
349   const auto stride = node.param().stride;
350   const auto padding =
351       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
352   const auto activation = node.param().activation;
353
354   VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
355   VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
356   VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
357   VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
358   VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
359   VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
360   VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
361   VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
362   VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
363   VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
364   VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
365   VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
366
367   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
368   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
369
370   ::arm_compute::PoolingLayerInfo info{
371       ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
372       acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
373
374   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
375
376   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
377
378   auto acl_fn = asAclFunction(std::move(fn));
379
380   _execution_builder->append((std::move(acl_fn)));
381
382   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
383 }
384
385 void KernelGenerator::visit(const model::operation::Concat &node)
386 {
387   const auto ofm_index{node.getOutputs().at(0)};
388
389   std::vector<model::OperandIndex> input_indexes;
390
391   for (const auto &input : node.getInputs())
392     input_indexes.emplace_back(input);
393
394   const auto axis = node.param().axis;
395
396   // If tensor allocator allocate as subtensor
397   bool canEliminate = true;
398   for (auto &ifm_ind : input_indexes)
399   {
400     if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
401     {
402       canEliminate = false;
403       break;
404     }
405   }
406   if (canEliminate)
407   {
408     // If concat eliminated, return a NOP IFunction
409     _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
410     return;
411   }
412
413   auto output_alloc = _tensor_builder->at(ofm_index).get();
414
415   std::vector<operand::ICLTensor *> input_allocs;
416   for (auto &ifm_ind : input_indexes)
417     input_allocs.emplace_back(_tensor_builder->at(ifm_ind).get());
418
419   auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>();
420
421   const auto rank = _ctx.at(ofm_index).shape().rank();
422   const auto frontend_layout = _current_subg_layout;
423   const auto backend_layout = output_alloc->layout();
424   const auto fixed_axis =
425       acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
426
427   fn->configure(input_allocs, fixed_axis, output_alloc);
428
429   auto acl_fn = asAclFunction(std::move(fn));
430
431   _execution_builder->append(std::move(acl_fn));
432 }
433
434 void KernelGenerator::visit(const model::operation::FullyConnected &node)
435 {
436   using model::operation::FullyConnected;
437
438   const auto output_index{node.getOutputs().at(0)};
439   const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
440   const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
441   const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
442
443   const auto input_rank = _ctx.at(input_index).shape().rank();
444   // TODO Currently we are not handling where the case is that the input's rank is 3.
445   // The handling should be added in the future.
446   assert(input_rank != 3);
447
448   const auto output_size = _ctx.at(output_index).shape().dim(1);
449   UNUSED_RELEASE(output_size);
450   assert(_ctx.at(bias_index).shape().dim(0) == output_size);
451   assert(_ctx.at(weight_index).shape().dim(0) == output_size);
452   const auto batch_size = _ctx.at(output_index).shape().dim(0);
453   const auto input_size = _ctx.at(weight_index).shape().dim(1);
454
455   // Check for reshaping input's shape into rank-2
456   bool needs_reshape = false;
457   neurun::model::Shape reshape(2);
458   if (input_rank == 4)
459   {
460     const auto feature_size = _ctx.at(input_index).shape().num_elements();
461
462     UNUSED_RELEASE(feature_size);
463     assert(batch_size >= 0 && input_size >= 0);
464     assert(feature_size == static_cast<uint64_t>(batch_size) * static_cast<uint64_t>(input_size));
465
466     // for reshaping
467     needs_reshape = true;
468     reshape.dim(0) = batch_size; /* H */
469     reshape.dim(1) = input_size; /* W */
470   }
471
472   const auto activation = node.param().activation;
473
474   auto output_alloc = _tensor_builder->at(output_index).get();
475   const auto input_alloc = _tensor_builder->at(input_index).get();
476   const auto weight_alloc = _tensor_builder->at(weight_index).get();
477   const auto bias_alloc = _tensor_builder->at(bias_index).get();
478   const auto frontend_layout = _current_subg_layout;
479   const auto acl_layout = output_alloc->handle()->info()->data_layout();
480
481   auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
482       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
483
484   fn->configure(
485       input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
486       needs_reshape,
487       ::neurun::backend::acl_common::asTensorShape(
488           reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout)));
489
490   auto acl_fn = asAclFunction(std::move(fn));
491
492   _execution_builder->append(std::move(acl_fn));
493
494   ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle());
495 }
496
497 void KernelGenerator::visit(const model::operation::Mul &node)
498 {
499   const auto ofm_index{node.getOutputs().at(0)};
500   const auto lhs_index{node.getInputs().at(model::operation::Mul::Input::LHS)};
501   const auto rhs_index{node.getInputs().at(model::operation::Mul::Input::RHS)};
502
503   const auto activation = node.param().activation;
504
505   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
506   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
507   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
508
509   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>();
510
511   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
512                 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
513
514   auto acl_fn = asAclFunction(std::move(fn));
515
516   _execution_builder->append(std::move(acl_fn));
517
518   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
519 }
520
521 void KernelGenerator::visit(const model::operation::ReduceSum &node)
522 {
523   const auto output_index{node.getOutputs().at(0)};
524   const auto input_index{node.getInputs().at(model::operation::ReduceSum::Input::INPUT)};
525   const auto &axes{node.param().axes};
526
527   auto output_alloc = _tensor_builder->at(output_index).get();
528   auto input_alloc = _tensor_builder->at(input_index).get();
529   const auto frontend_layout = _current_subg_layout;
530   const auto backend_layout = input_alloc->layout();
531
532   // Convert to ACL axes taking into account negative values and possible duplicates.
533   std::set<std::uint32_t> acl_axes;
534   const int input_rank = _ctx.at(input_index).shape().rank();
535   for (int axis : axes)
536   {
537     if (axis < 0)
538       axis += input_rank;
539     acl_axes.insert(
540         acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value());
541   }
542
543   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
544
545   fn->configure(input_alloc->handle(), output_alloc->handle(), acl_axes,
546                 ::arm_compute::ReduceOperation::SUM);
547
548   auto acl_fn = asAclFunction(std::move(fn));
549
550   _execution_builder->append(std::move(acl_fn));
551 }
552
553 void KernelGenerator::visit(const model::operation::Reshape &node)
554 {
555   const auto output_index{node.getOutputs().at(0)};
556   const auto input_index{node.getInputs().at(model::operation::Reshape::Input::INPUT)};
557
558   auto output_alloc = _tensor_builder->at(output_index).get();
559   auto input_alloc = _tensor_builder->at(input_index).get();
560
561   // NOTE This operation must not be changed the layout from frontend to backend
562   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
563   const auto frontend_layout = _current_subg_layout;
564   const auto backend_layout = output_alloc->layout();
565   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
566          frontend_layout == backend_layout);
567   UNUSED_RELEASE(frontend_layout);
568   UNUSED_RELEASE(backend_layout);
569
570   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReshapeLayer>();
571
572   fn->configure(input_alloc->handle(), output_alloc->handle());
573
574   auto acl_fn = asAclFunction(std::move(fn));
575
576   _execution_builder->append(std::move(acl_fn));
577 }
578
579 void KernelGenerator::visit(const model::operation::Squeeze &node)
580 {
581   // Squeeze is identical to reshape except that it has an optional dimensions input.
582   // In addition, optional dims_index is ignored since output tensor already has squeezed shape
583   // by freezer and toco
584   // TODO Support multi-layout for frontend and backend
585   const auto output_index{node.getOutputs().at(0)};
586   const auto input_index{node.getInputs().at(model::operation::Squeeze::Input::INPUT)};
587   const auto dims{node.param().dims};
588   const auto ndim{node.param().ndim};
589   (void)dims;
590   (void)ndim;
591
592   auto output_alloc = _tensor_builder->at(output_index).get();
593   auto input_alloc = _tensor_builder->at(input_index).get();
594   auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>();
595   fn->configure(input_alloc->handle(), output_alloc->handle());
596   auto acl_fn = asAclFunction(std::move(fn));
597   _execution_builder->append(std::move(acl_fn));
598 }
599
600 void KernelGenerator::visit(const model::operation::Tanh &node)
601 {
602   const auto output_index{node.getOutputs().at(0)};
603   const auto input_index{node.getInputs().at(model::operation::Tanh::Input::INPUT)};
604
605   auto output_alloc = _tensor_builder->at(output_index).get();
606   auto input_alloc = _tensor_builder->at(input_index).get();
607
608   auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
609
610   const ::arm_compute::ActivationLayerInfo act_info{
611       ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
612
613   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
614
615   auto acl_fn = asAclFunction(std::move(fn));
616
617   _execution_builder->append(std::move(acl_fn));
618 }
619
620 void KernelGenerator::visit(const model::operation::Softmax &node)
621 {
622   const auto output_index{node.getOutputs().at(0)};
623   const auto input_index{node.getInputs().at(model::operation::Softmax::Input::INPUT)};
624
625   const auto beta = node.param().beta;
626
627   auto output_alloc = _tensor_builder->at(output_index).get();
628   auto input_alloc = _tensor_builder->at(input_index).get();
629
630   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(
631       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
632
633   fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
634
635   auto acl_fn = asAclFunction(std::move(fn));
636
637   _execution_builder->append(std::move(acl_fn));
638 }
639
640 void KernelGenerator::visit(const model::operation::StridedSlice &node)
641 {
642   const auto output_index{node.getOutputs().at(0)};
643   const auto input_index{node.getInputs().at(model::operation::StridedSlice::Input::INPUT)};
644   const auto starts_index{node.getInputs().at(model::operation::StridedSlice::Input::STARTS)};
645   const auto ends_index{node.getInputs().at(model::operation::StridedSlice::Input::ENDS)};
646   const auto strides_index{node.getInputs().at(model::operation::StridedSlice::Input::STRIDES)};
647
648   auto outputData_alloc = _tensor_builder->at(output_index).get();
649   auto inputData_alloc = _tensor_builder->at(input_index).get();
650   const auto frontend_layout = _current_subg_layout;
651   const auto backend_layout = inputData_alloc->layout();
652
653   // Set initializers for indices data such as order of inputData
654   int input_rank = _ctx.at(input_index).shape().rank();
655   std::vector<int32_t> starts;
656   std::vector<int32_t> ends;
657   std::vector<int32_t> strides;
658   starts.resize(input_rank, 0);
659   ends.resize(input_rank, 0);
660   strides.resize(input_rank, 0);
661   {
662     auto startData_base = _ctx.at(starts_index).data().base();
663     auto endData_base = _ctx.at(ends_index).data().base();
664     auto stridesData_base = _ctx.at(strides_index).data().base();
665     const int startData_size = _ctx.at(starts_index).shape().num_elements();
666     const int endData_size = _ctx.at(ends_index).shape().num_elements();
667     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
668
669     using neurun::model::DataType;
670
671     UNUSED_RELEASE(startData_size);
672     UNUSED_RELEASE(endData_size);
673     UNUSED_RELEASE(stridesData_size);
674
675     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
676     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
677     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
678     assert(startData_size == input_rank);
679     assert(endData_size == input_rank);
680     assert(stridesData_size == input_rank);
681
682     assert(startData_base != nullptr);
683     for (int n = 0; n < input_rank; ++n)
684     {
685       auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
686                                                                   backend_layout)
687                       .value();
688
689       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
690       starts[axis] = start_value;
691
692       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
693       ends[axis] = end_value;
694
695       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
696       strides[axis] = strides_value;
697     }
698   }
699
700   // Set mask bits such as order of inputData
701   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
702                                                            frontend_layout, backend_layout);
703   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
704                                                          frontend_layout, backend_layout);
705   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
706       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
707
708   ::arm_compute::Coordinates starts_set;
709   ::arm_compute::Coordinates ends_set;
710   ::arm_compute::BiStrides strides_set;
711
712   for (size_t i = 0; i < starts.size(); ++i)
713   {
714     starts_set.set(i, starts[i]);
715     ends_set.set(i, ends[i]);
716     strides_set.set(i, strides[i]);
717   }
718
719   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>();
720
721   fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
722                 strides_set, begin_mask, end_mask, shrink_axis_mask);
723
724   auto acl_fn = asAclFunction(std::move(fn));
725
726   _execution_builder->append(std::move(acl_fn));
727 }
728
729 void KernelGenerator::visit(const model::operation::Transpose &node)
730 {
731   const auto ofm_idx{node.getOutputs().at(0)};
732   const auto ifm_idx{node.getInputs().at(model::operation::Transpose::Input::INPUT)};
733   const auto &perm{node.param().perm};
734
735   const auto rank = _ctx.at(ifm_idx).shape().rank();
736
737   auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
738   auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
739   const auto frontend_layout = _current_subg_layout;
740   const auto backend_layout = ifm_alloc->layout();
741
742   std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
743   // Reversed
744   auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector(
745       rank, pv, frontend_layout, backend_layout);
746
747   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
748
749   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
750
751   auto acl_fn = asAclFunction(std::move(fn));
752
753   _execution_builder->append(std::move(acl_fn));
754 }
755
756 void KernelGenerator::visit(const model::operation::Add &node)
757 {
758   const auto ofm_index{node.getOutputs().at(0)};
759   const auto lhs_index{node.getInputs().at(model::operation::Add::Input::LHS)};
760   const auto rhs_index{node.getInputs().at(model::operation::Add::Input::RHS)};
761
762   const auto activation = node.param().activation;
763
764   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
765   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
766   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
767
768   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>();
769
770   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
771                 arm_compute::ConvertPolicy::SATURATE);
772
773   auto acl_fn = asAclFunction(std::move(fn));
774
775   _execution_builder->append(std::move(acl_fn));
776
777   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
778 }
779
780 void KernelGenerator::visit(const model::operation::Sub &node)
781 {
782   const auto ofm_index{node.getOutputs().at(0)};
783   const auto lhs_index{node.getInputs().at(model::operation::Sub::Input::LHS)};
784   const auto rhs_index{node.getInputs().at(model::operation::Sub::Input::RHS)};
785
786   const auto activation = node.param().activation;
787
788   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
789   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
790   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
791
792   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>();
793
794   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
795                 arm_compute::ConvertPolicy::SATURATE);
796
797   auto acl_fn = asAclFunction(std::move(fn));
798
799   _execution_builder->append(std::move(acl_fn));
800
801   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
802 }
803
804 void KernelGenerator::visit(const model::operation::Div &node)
805 {
806   const auto ofm_index{node.getOutputs().at(0)};
807   const auto lhs_index{node.getInputs().at(model::operation::Div::Input::LHS)};
808   const auto rhs_index{node.getInputs().at(model::operation::Div::Input::RHS)};
809
810   const auto activation = node.param().activation;
811
812   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
813   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
814   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
815
816   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>();
817
818   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
819
820   auto acl_fn = asAclFunction(std::move(fn));
821
822   _execution_builder->append(std::move(acl_fn));
823
824   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
825 }
826
827 void KernelGenerator::visit(const model::operation::Exp &node)
828 {
829   const auto output_index{node.getOutputs().at(0)};
830   const auto input_index{node.getInputs().at(model::operation::Exp::Input::INPUT)};
831
832   auto output_alloc = _tensor_builder->at(output_index).get();
833   auto input_alloc = _tensor_builder->at(input_index).get();
834
835   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>();
836
837   fn->configure(input_alloc->handle(), output_alloc->handle());
838
839   auto acl_fn = asAclFunction(std::move(fn));
840
841   _execution_builder->append(std::move(acl_fn));
842 }
843
844 void KernelGenerator::visit(const model::operation::InstanceNorm &node)
845 {
846   const auto ofm_index{node.getOutputs().at(0)};
847   const auto ifm_index{node.getInputs().at(model::operation::InstanceNorm::Input::INPUT)};
848   const auto gamma_index{node.getInputs().at(model::operation::InstanceNorm::Input::GAMMA)};
849   const auto beta_index{node.getInputs().at(model::operation::InstanceNorm::Input::BETA)};
850
851   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
852   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
853   auto gamma_alloc = _tensor_builder->at(gamma_index).get();
854   auto beta_alloc = _tensor_builder->at(beta_index).get();
855   auto epsilon = node.param().epsilon;
856   auto activation = node.param().activation;
857
858   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
859
860   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
861                 beta_alloc->handle(), epsilon);
862
863   auto acl_fn = asAclFunction(std::move(fn));
864
865   _execution_builder->append(std::move(acl_fn));
866
867   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
868 }
869
870 void KernelGenerator::visit(const model::operation::Logistic &node)
871 {
872   const auto ofm_index{node.getOutputs().at(0)};
873   const auto ifm_index{node.getInputs().at(model::operation::Logistic::Input::INPUT)};
874
875   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
876   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
877
878   const ::arm_compute::ActivationLayerInfo act_info{
879       ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
880
881   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
882
883   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
884
885   auto acl_fn = asAclFunction(std::move(fn));
886
887   _execution_builder->append(std::move(acl_fn));
888 }
889
890 void KernelGenerator::visit(const model::operation::LogicalAnd &node)
891 {
892   const auto output_index{node.getOutputs().at(0)};
893   const auto input0_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT0)};
894   const auto input1_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT1)};
895
896   auto output_alloc = _tensor_builder->at(output_index).get();
897   auto input0_alloc = _tensor_builder->at(input0_index).get();
898   auto input1_alloc = _tensor_builder->at(input1_index).get();
899
900   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>();
901
902   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
903                 ::arm_compute::BinaryLogicalOperation::AND);
904
905   auto acl_fn = asAclFunction(std::move(fn));
906
907   _execution_builder->append(std::move(acl_fn));
908 }
909
910 void KernelGenerator::visit(const model::operation::LSTM &node)
911 {
912   // TODO Support dynamic rnn
913   // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
914   const auto scratch_buffer_index{
915       node.getOutputs().at(model::operation::LSTM::Output::SCRATCH_BUFFER)};
916   const auto output_state_out_index{
917       node.getOutputs().at(model::operation::LSTM::Output::OUTPUT_STATE_OUT)};
918   const auto cell_state_out_index{
919       node.getOutputs().at(model::operation::LSTM::Output::CELL_STATE_OUT)};
920   const auto output_index{node.getOutputs().at(model::operation::LSTM::Output::OUTPUT)};
921
922   const auto input_index{node.getInputs().at(model::operation::LSTM::Input::INPUT)};
923   const auto input_to_input_weights_index{
924       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
925   const auto input_to_forget_weights_index{
926       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
927   const auto input_to_cell_weights_index{
928       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
929   const auto input_to_output_weights_index{
930       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
931   const auto recurrent_to_input_weights_index{
932       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
933   const auto recurrent_to_forget_weights_index{
934       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
935   const auto recurrent_to_cell_weights_index{
936       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
937   const auto recurrent_to_output_weights_index{
938       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
939   const auto cell_to_input_weights_index{
940       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
941   const auto cell_to_forget_weights_index{
942       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
943   const auto cell_to_output_weights_index{
944       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
945   const auto input_gate_bias_index{
946       node.getInputs().at(model::operation::LSTM::Input::INPUT_GATE_BIAS)};
947   const auto forget_gate_bias_index{
948       node.getInputs().at(model::operation::LSTM::Input::FORGET_GATE_BIAS)};
949   const auto cell_bias_index{node.getInputs().at(model::operation::LSTM::Input::CELL_BIAS)};
950   const auto output_gate_bias_index{
951       node.getInputs().at(model::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
952   const auto projection_weights_index{
953       node.getInputs().at(model::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
954   const auto projection_bias_index{
955       node.getInputs().at(model::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
956   const auto output_state_in_index{
957       node.getInputs().at(model::operation::LSTM::Input::OUTPUT_STATE_IN)};
958   const auto cell_state_in_index{node.getInputs().at(model::operation::LSTM::Input::CELL_STATE_IN)};
959   const auto cell_threshold = node.param().cell_threshold;
960   const auto projection_threshold = node.param().projection_threshold;
961
962   bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
963                                     _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
964   bool has_recurrent_to_input_weights =
965       _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
966       _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
967   bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
968   bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
969   bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
970                                 _ctx.at(projection_weights_index).shape().dim(1) != 0;
971   bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
972
973   // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
974   // true: no CIFG
975   // false: CIFG
976   // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
977   bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
978
979   // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
980   // But the cell_to_input_weights does not exist in regular CIFG although peephole.
981   // true: peephole
982   // false: no peephole
983   bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
984
985   // NOTE Although the projection weights has data the projection bias may not have data.
986   bool has_projection_param = has_projection_weights;
987
988   const auto activation = node.param().activation;
989   const auto cell_clip = cell_threshold;
990   const auto projection_clip = projection_threshold;
991   assert(cell_clip >= 0.f && projection_clip >= 0.f);
992
993   auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
994   auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
995   auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
996   auto output_alloc = _tensor_builder->at(output_index).get();
997
998   auto input_alloc = _tensor_builder->at(input_index).get();
999
1000   auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
1001   auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
1002   auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
1003   auto recurrent_to_forget_weights_alloc =
1004       _tensor_builder->at(recurrent_to_forget_weights_index).get();
1005   auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
1006   auto recurrent_to_output_weights_alloc =
1007       _tensor_builder->at(recurrent_to_output_weights_index).get();
1008
1009   auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
1010   auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
1011   auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
1012   auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
1013   auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
1014
1015   auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
1016
1017   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>();
1018
1019   ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
1020   if (has_cifg_param)
1021   {
1022     auto input_to_input_weights_alloc =
1023         _tensor_builder->at(input_to_input_weights_index).get(); // optional
1024     auto recurrent_to_input_weights_alloc =
1025         _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
1026     auto cell_to_input_weights_handle =
1027         has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
1028                            : nullptr; // optional (non-cifg && peephole)
1029     auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
1030     lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
1031                                 recurrent_to_input_weights_alloc->handle(),
1032                                 cell_to_input_weights_handle, input_gate_bias_alloc->handle());
1033   }
1034   if (has_peephole_param)
1035   {
1036     auto cell_to_forget_weights_alloc =
1037         _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
1038     auto cell_to_output_weights_alloc =
1039         _tensor_builder->at(cell_to_output_weights_index).get(); // optional
1040     lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
1041                                     cell_to_output_weights_alloc->handle());
1042   }
1043   if (has_projection_param)
1044   {
1045     auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
1046     auto projection_bias_handle = has_projection_bias
1047                                       ? _tensor_builder->at(projection_bias_index).get()->handle()
1048                                       : nullptr; // optional
1049     lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
1050   }
1051
1052   fn->configure(
1053       input_alloc->handle(), input_to_forget_weights_alloc->handle(),
1054       input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
1055       recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
1056       recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
1057       cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
1058       cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
1059       output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
1060       lstm_params, act_info, cell_clip, projection_clip);
1061
1062   auto acl_fn = asAclFunction(std::move(fn));
1063
1064   _execution_builder->append(std::move(acl_fn));
1065 }
1066
1067 void KernelGenerator::visit(const model::operation::ReduceMax &node)
1068 {
1069   const auto output_index{node.getOutputs().at(0)};
1070   const auto input_index{node.getInputs().at(model::operation::ReduceMax::Input::INPUT)};
1071   const auto &axes{node.param().axes};
1072
1073   auto ofm_alloc = _tensor_builder->at(output_index).get();
1074   auto ifm_alloc = _tensor_builder->at(input_index).get();
1075   const auto frontend_layout = _current_subg_layout;
1076   const auto backend_layout = ifm_alloc->layout();
1077
1078   // Convert to ACL axes taking into account negative values and possible duplicates.
1079   std::set<std::uint32_t> acl_axes;
1080   const int ifm_rank = _ctx.at(input_index).shape().rank();
1081   for (int axis : axes)
1082   {
1083     if (axis < 0)
1084       axis += ifm_rank;
1085     acl_axes.insert(
1086         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1087   }
1088
1089   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
1090
1091   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
1092                 arm_compute::ReduceOperation::MAX);
1093
1094   auto acl_fn = asAclFunction(std::move(fn));
1095
1096   _execution_builder->append(std::move(acl_fn));
1097 }
1098
1099 void KernelGenerator::visit(const model::operation::Comparison &node)
1100 {
1101   const auto output_index{node.getOutputs().at(0)};
1102   const auto input0_index{node.getInputs().at(model::operation::Comparison::Input::INPUT0)};
1103   const auto input1_index{node.getInputs().at(model::operation::Comparison::Input::INPUT1)};
1104
1105   const auto comparison_type = node.param().comparison_type;
1106
1107   auto output_alloc = _tensor_builder->at(output_index).get();
1108   auto input0_alloc = _tensor_builder->at(input0_index).get();
1109   auto input1_alloc = _tensor_builder->at(input1_index).get();
1110
1111   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>();
1112
1113   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
1114                 (arm_compute::ComparisonOperation)comparison_type);
1115
1116   auto acl_fn = asAclFunction(std::move(fn));
1117
1118   _execution_builder->append(std::move(acl_fn));
1119 }
1120
1121 void KernelGenerator::visit(const model::operation::Pack &node)
1122 {
1123   const auto output_index{node.getOutputs().at(0)};
1124   auto axis{node.param().axis};
1125
1126   const auto output_rank = _ctx.at(output_index).shape().rank();
1127
1128   std::vector<model::OperandIndex> input_indexes;
1129   for (const auto &input_index : node.getInputs())
1130     input_indexes.emplace_back(input_index);
1131
1132   auto output = _tensor_builder->at(output_index).get()->handle();
1133   std::vector<arm_compute::ICLTensor *> inputs;
1134   for (const auto &input_index : input_indexes)
1135     inputs.emplace_back(_tensor_builder->at(input_index)->handle());
1136
1137   const auto frontend_layout = _current_subg_layout;
1138   const auto backend_layout = _tensor_builder->at(output_index).get()->layout();
1139
1140   if (output_rank >= 4 && _current_subg_layout != backend_layout)
1141   {
1142     throw std::runtime_error("ACL CL : Pack does not support different layouts between frontend "
1143                              "and backend in ranks above 4");
1144   }
1145
1146   if (axis < 0)
1147     axis += output_rank;
1148   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
1149
1150   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStackLayer>();
1151
1152   fn->configure(inputs, axis, output);
1153
1154   _execution_builder->append(asAclFunction(std::move(fn)));
1155 }
1156
1157 void KernelGenerator::visit(const model::operation::Permute &node)
1158 {
1159   const auto ofm_idx{node.getOutputs().at(0)};
1160   const auto ifm_idx{node.getInputs().at(0)};
1161   const auto permute_type = node.getPermuteType();
1162   auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
1163   auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
1164   const auto rank = _ctx.at(ofm_idx).shape().rank();
1165   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
1166
1167   std::unique_ptr<::arm_compute::IFunction> fn;
1168   arm_compute::PermutationVector pv;
1169   if (permute_type == model::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
1170   {
1171     // WHCN -> CWHN
1172     pv = arm_compute::PermutationVector{2, 0, 1};
1173
1174     auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
1175
1176     l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
1177
1178     fn = std::move(l);
1179   }
1180   else if (permute_type == model::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
1181   {
1182     // CWHN -> WHCN
1183     pv = arm_compute::PermutationVector{1, 2, 0};
1184
1185     auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
1186
1187     l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
1188
1189     fn = std::move(l);
1190   }
1191   else
1192   {
1193     auto l = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();
1194
1195     l->configure(ifm_alloc->handle(), ofm_alloc->handle());
1196
1197     fn = std::move(l);
1198   }
1199
1200   auto acl_fn = asAclFunction(std::move(fn));
1201
1202   _execution_builder->append(std::move(acl_fn));
1203 }
1204
1205 void KernelGenerator::visit(const model::operation::RSQRT &node)
1206 {
1207   const auto ofm_index{node.getOutputs().at(0)};
1208   const auto ifm_index{node.getInputs().at(model::operation::RSQRT::Input::INPUT)};
1209
1210   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1211   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1212
1213   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>();
1214
1215   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1216
1217   _execution_builder->append(asAclFunction(std::move(fn)));
1218 }
1219
1220 void KernelGenerator::visit(const model::operation::ReLU &node)
1221 {
1222   const auto output_index{node.getOutputs().at(0)};
1223   const auto input_index{node.getInputs().at(model::operation::ReLU::Input::INPUT)};
1224
1225   auto output_alloc = _tensor_builder->at(output_index).get();
1226   auto input_alloc = _tensor_builder->at(input_index).get();
1227
1228   auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
1229
1230   const ::arm_compute::ActivationLayerInfo act_info{
1231       ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
1232
1233   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1234
1235   auto acl_fn = asAclFunction(std::move(fn));
1236
1237   _execution_builder->append(std::move(acl_fn));
1238 }
1239
1240 void KernelGenerator::visit(const model::operation::ResizeBilinear &node)
1241 {
1242   const auto ofm_index{node.getOutputs().at(0)};
1243
1244   const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinear::Input::INPUT)};
1245
1246   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1247   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1248
1249   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>();
1250
1251   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
1252                 ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
1253                 ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1254
1255   auto acl_fn = asAclFunction(std::move(fn));
1256
1257   _execution_builder->append(std::move(acl_fn));
1258 }
1259
1260 void KernelGenerator::visit(const model::operation::ReLU1 &node)
1261 {
1262   const auto ofm_index{node.getOutputs().at(0)};
1263   const auto ifm_index{node.getInputs().at(model::operation::ReLU1::Input::INPUT)};
1264
1265   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1266   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1267
1268   const ::arm_compute::ActivationLayerInfo act_info{
1269       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
1270
1271   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1272
1273   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1274
1275   auto acl_fn = asAclFunction(std::move(fn));
1276
1277   _execution_builder->append(std::move(acl_fn));
1278 }
1279
1280 void KernelGenerator::visit(const model::operation::ReLU6 &node)
1281 {
1282   const auto ofm_index{node.getOutputs().at(0)};
1283   const auto ifm_index{node.getInputs().at(model::operation::ReLU6::Input::INPUT)};
1284
1285   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1286   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1287
1288   const ::arm_compute::ActivationLayerInfo act_info{
1289       ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
1290
1291   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1292
1293   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1294
1295   auto acl_fn = asAclFunction(std::move(fn));
1296
1297   _execution_builder->append(std::move(acl_fn));
1298 }
1299
1300 void KernelGenerator::visit(const model::operation::RNN &node)
1301 {
1302   const auto output_index{node.getOutputs().at(model::operation::RNN::Output::OUTPUT)};
1303   const auto hidden_state_out_index{
1304       node.getOutputs().at(model::operation::RNN::Output::HIDDEN_STATE_OUT)};
1305
1306   const auto input_index{node.getInputs().at(model::operation::RNN::Input::INPUT)};
1307   const auto weights_index{node.getInputs().at(model::operation::RNN::Input::WEIGHTS)};
1308   const auto recurrent_weights_index{
1309       node.getInputs().at(model::operation::RNN::Input::RECURRENT_WEIGHTS)};
1310   const auto bias_index{node.getInputs().at(model::operation::RNN::Input::BIAS)};
1311   const auto hidden_state_in_index{
1312       node.getInputs().at(model::operation::RNN::Input::HIDDEN_STATE_IN)};
1313
1314   const auto activation = node.param().activation;
1315
1316   auto output_alloc = _tensor_builder->at(output_index).get();
1317   auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
1318
1319   auto input_alloc = _tensor_builder->at(input_index).get();
1320   auto weights_alloc = _tensor_builder->at(weights_index).get();
1321   auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
1322   auto bias_alloc = _tensor_builder->at(bias_index).get();
1323   auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
1324   auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
1325
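  // Copy the previous hidden state into the hidden-state output tensor first; the RNN layer below
  // then uses that tensor as the recurrent state and updates it in place.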
1326   auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();
1327   copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
1328   _execution_builder->append(asAclFunction(std::move(copy_layer)));
1329
1330   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>(
1331       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
1332   fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
1333                 bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
1334                 act_info);
1335   _execution_builder->append(asAclFunction(std::move(fn)));
1336 }
1337
1338 void KernelGenerator::visit(const model::operation::Floor &node)
1339 {
1340   const auto ofm_index{node.getOutputs().at(0)};
1341   const auto ifm_index{node.getInputs().at(model::operation::Floor::Input::INPUT)};
1342
1343   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1344   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1345
1346   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>();
1347
1348   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1349
1350   auto acl_fn = asAclFunction(std::move(fn));
1351
1352   _execution_builder->append(std::move(acl_fn));
1353 }
1354
1355 void KernelGenerator::visit(const model::operation::SpaceToBatchND &node)
1356 {
1357   const auto ofm_index{node.getOutputs().at(0)};
1358   const auto ifm_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::INPUT)};
1359   const auto block_size_index{
1360       node.getInputs().at(model::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1361   const auto paddings_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::PADDINGS)};
1362
1363   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1364   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1365   auto block_size_alloc = _tensor_builder->at(block_size_index).get();
1366   auto paddings_alloc = _tensor_builder->at(paddings_index).get();
1367
1368   assert(_ctx.at(block_size_index).isConstant());
1369   assert(_ctx.at(paddings_index).isConstant());
1370
1371   std::unique_ptr<::arm_compute::IFunction> fn;
1372   if (_ctx.at(ofm_index).typeInfo().type() == model::DataType::QUANT8_ASYMM)
1373   {
1374     // NOTE CLSpaceToBatchLayer has a bug: it pads with value 0 even when the zero point of
1375     // QASYMM8 is not 0, so the EX kernel CLSpaceToBatchND is used instead for quantized tensors.
1376     auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchND>();
1377     l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
1378                  ofm_alloc->handle());
1379     fn = std::move(l);
1380   }
1381   else
1382   {
1383     auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchLayer>();
1384     l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
1385                  ofm_alloc->handle());
1386     fn = std::move(l);
1387   }
1388
1389   auto acl_fn = asAclFunction(std::move(fn));
1390
1391   _execution_builder->append(std::move(acl_fn));
1392 }
1393
1394 void KernelGenerator::visit(const model::operation::SpaceToDepth &node)
1395 {
1396   const auto ofm_index{node.getOutputs().at(0)};
1397   const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepth::Input::INPUT)};
1398
1399   auto block_size = node.param().block_size;
1400
1401   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1402   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1403
1404   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>();
1405
1406   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
1407
1408   auto acl_fn = asAclFunction(std::move(fn));
1409
1410   _execution_builder->append(std::move(acl_fn));
1411 }
1412
1413 void KernelGenerator::visit(const model::operation::L2Pool2D &node)
1414 {
1415   const auto ofm_index{node.getOutputs().at(0)};
1416   const auto ifm_index{node.getInputs().at(model::operation::L2Pool2D::Input::INPUT)};
1417
1418   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1419   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1420
1421   uint32_t kw = node.param().kw;
1422   uint32_t kh = node.param().kh;
1423   const auto stride = node.param().stride;
1424   const auto padding =
1425       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
1426   const auto activation = node.param().activation;
1427
1428   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1429   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1430
1431   ::arm_compute::PoolingLayerInfo info{
1432       ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
1433       ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)};
1434
1435   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
1436
1437   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
1438
1439   auto acl_fn = asAclFunction(std::move(fn));
1440
1441   _execution_builder->append(std::move(acl_fn));
1442
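  // The fused activation, if any, is appended as a separate in-place activation function.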
1443   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
1444 }
1445
1446 void KernelGenerator::visit(const model::operation::EmbeddingLookup &node)
1447 {
1448   const auto output_index{node.getOutputs().at(0)};
1449   const auto lookups_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::LOOKUPS)};
1450   const auto values_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::VALUES)};
1451
1452   auto output_alloc = _tensor_builder->at(output_index).get();
1453   auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1454   auto values_alloc = _tensor_builder->at(values_index).get();
1455
1456   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>();
1457
1458   fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
1459
1460   auto acl_fn = asAclFunction(std::move(fn));
1461
1462   _execution_builder->append(std::move(acl_fn));
1463 }
1464
1465 void KernelGenerator::visit(const model::operation::L2Normalization &node)
1466 {
1467   const auto ofm_index{node.getOutputs().at(0)};
1468   const auto ifm_index{node.getInputs().at(model::operation::L2Normalization::Input::INPUT)};
1469
1470   // {CL|Neon}L2Normalization performs the reduction only along dimension 0.
1471   // L2 Normalization always performs the reduction along the depth axis.
1472   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1473   // choosing the normalization parameters as below.
1474
1475   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1476   // TODO Support an optional constant axis along which the normalization is performed
1477   const auto normalization_axis = ifm_shape.rank() - 1;
1478   int32_t radius =
1479       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1480   float alpha = 1.0f;                            // Chosen so that the implementation's internal alpha_ becomes 1
1481   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1482   float bias = 0.0f;                             // Don't offset the reduction.
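  // With bias = 0, alpha = 1 and beta = 0.5 (and is_scaled = false below), CROSS_MAP normalization
  // computes x / (bias + alpha * sum(x^2))^beta = x / sqrt(sum(x^2)) over the whole depth, i.e. an
  // L2 normalization along the depth axis.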
1483
1484   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1485   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1486
1487   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1488                                                                radius, alpha, beta, bias, false);
1489
1490   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
1491
1492   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
1493
1494   auto acl_fn = asAclFunction(std::move(fn));
1495
1496   _execution_builder->append(std::move(acl_fn));
1497 }
1498
1499 void KernelGenerator::visit(const model::operation::HashtableLookup &node)
1500 {
1501   const auto output_index{node.getOutputs().at(model::operation::HashtableLookup::Output::OUTPUT)};
1502   const auto hits_index{node.getOutputs().at(model::operation::HashtableLookup::Output::HITS)};
1503
1504   const auto lookups_index{node.getInputs().at(model::operation::HashtableLookup::Input::LOOKUPS)};
1505   const auto keys_index{node.getInputs().at(model::operation::HashtableLookup::Input::KEYS)};
1506   const auto values_index{node.getInputs().at(model::operation::HashtableLookup::Input::VALUES)};
1507
1508   auto output_alloc = _tensor_builder->at(output_index).get();
1509   auto hits_alloc = _tensor_builder->at(hits_index).get();
1510
1511   auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1512   auto keys_alloc = _tensor_builder->at(keys_index).get();
1513   auto values_alloc = _tensor_builder->at(values_index).get();
1514
1515   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>();
1516
1517   fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
1518                 output_alloc->handle(), hits_alloc->handle());
1519
1520   auto acl_fn = asAclFunction(std::move(fn));
1521
1522   _execution_builder->append(std::move(acl_fn));
1523 }
1524
1525 void KernelGenerator::visit(const model::operation::PReLU &node)
1526 {
1527   const auto ofm_index{node.getOutputs().at(0)};
1528   const auto ifm_index{node.getInputs().at(model::operation::PReLU::Input::INPUT)};
1529   const auto alpha_index{node.getInputs().at(model::operation::PReLU::Input::ALPHA)};
1530
1531   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1532   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1533   auto alpha_alloc = _tensor_builder->at(alpha_index).get();
1534
1535   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>();
1536
1537   fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
1538
1539   auto acl_fn = asAclFunction(std::move(fn));
1540
1541   _execution_builder->append(std::move(acl_fn));
1542 }
1543
1544 void KernelGenerator::visit(const model::operation::TransposeConv &node)
1545 {
1546   const auto ofm_index{node.getOutputs().at(0)};
1547   const auto output_shape_index{
1548       node.getInputs().at(model::operation::TransposeConv::Input::OUTPUT_SHAPE)};
1549   const auto ker_index{node.getInputs().at(model::operation::TransposeConv::Input::KERNEL)};
1550   const auto ifm_index{node.getInputs().at(model::operation::TransposeConv::Input::INPUT)};
1551
1552   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1553   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1554   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout);
1555
1556   const auto stride = node.param().stride;
1557
1558   assert((node.param().padding.type == model::PaddingType::SAME) ||
1559          (node.param().padding.type == model::PaddingType::VALID));
1560   auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1561                                                 ker_shape.W, ker_shape.H);
1562
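  // With VALID padding, the natural transposed-convolution output extent is
  // (ifm - 1) * stride + ker. Any surplus of the requested output shape over that extent is
  // passed to ACL as the invalid right/bottom border.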
1563   uint32_t invalid_horizontal = 0;
1564   uint32_t invalid_vertical = 0;
1565   if (node.param().padding.type == model::PaddingType::VALID)
1566   {
1567     invalid_horizontal =
1568         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1569     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1570   }
1571
1572   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1573   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1574   auto ker_alloc = _tensor_builder->at(ker_index).get();
1575
1576   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1577
1578   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>(
1579       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
1580
1581   fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
1582                 invalid_horizontal, invalid_vertical);
1583
1584   auto acl_fn = asAclFunction(std::move(fn));
1585
1586   _execution_builder->append(std::move(acl_fn));
1587 }
1588
1589 void KernelGenerator::visit(const model::operation::SQRT &node)
1590 {
1591   const auto output_index{node.getOutputs().at(0)};
1592   const auto input_index{node.getInputs().at(model::operation::SQRT::Input::INPUT)};
1593
1594   auto output_alloc = _tensor_builder->at(output_index).get();
1595   auto input_alloc = _tensor_builder->at(input_index).get();
1596
1597   const ::arm_compute::ActivationLayerInfo act_info{
1598       ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
1599
1600   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1601
1602   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1603
1604   auto acl_fn = asAclFunction(std::move(fn));
1605
1606   _execution_builder->append(std::move(acl_fn));
1607 }
1608
1609 void KernelGenerator::visit(const model::operation::LogicalOr &node)
1610 {
1611   const auto output_index{node.getOutputs().at(0)};
1612   const auto input0_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT0)};
1613   const auto input1_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT1)};
1614
1615   auto output_alloc = _tensor_builder->at(output_index).get();
1616   auto input0_alloc = _tensor_builder->at(input0_index).get();
1617   auto input1_alloc = _tensor_builder->at(input1_index).get();
1618
1619   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>();
1620
1621   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
1622
1623   auto acl_fn = asAclFunction(std::move(fn));
1624
1625   _execution_builder->append(std::move(acl_fn));
1626 }
1627
1628 void KernelGenerator::visit(const model::operation::LogicalNot &node)
1629 {
1630   const auto output_index{node.getOutputs().at(0)};
1631   const auto input_index{node.getInputs().at(model::operation::LogicalNot::Input::INPUT)};
1632
1633   auto output_alloc = _tensor_builder->at(output_index).get();
1634   auto input_alloc = _tensor_builder->at(input_index).get();
1635
1636   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>();
1637
1638   fn->configure(input_alloc->handle(), output_alloc->handle());
1639
1640   auto acl_fn = asAclFunction(std::move(fn));
1641
1642   _execution_builder->append(std::move(acl_fn));
1643 }
1644
1645 void KernelGenerator::visit(const model::operation::SquaredDifference &node)
1646 {
1647   const auto ofm_index{node.getOutputs().at(0)};
1648   const auto lhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::LHS)};
1649   const auto rhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::RHS)};
1650
1651   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1652   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
1653   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
1654
1655   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
1656
1657   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
1658
1659   auto acl_fn = asAclFunction(std::move(fn));
1660
1661   _execution_builder->append(std::move(acl_fn));
1662 }
1663
1664 void KernelGenerator::visit(const model::operation::TopKV2 &node)
1665 {
1666   const auto outputValues_index{
1667       node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_VALUES)};
1668   const auto outputIndices_index{
1669       node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_INDICES)};
1670
1671   const auto inputData_index{node.getInputs().at(model::operation::TopKV2::Input::INPUT)};
1672
1673   // Currently, only rank-1 or rank-2 (vector or batched-vector) input is supported.
1674   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1675          _ctx.at(inputData_index).shape().rank() == 2);
1676
1677   const auto k = node.param().k;
1678
1679   auto values_alloc = _tensor_builder->at(outputValues_index).get();
1680   auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
1681   auto input_alloc = _tensor_builder->at(inputData_index).get();
1682
1683   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>();
1684
1685   fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
1686
1687   auto acl_fn = asAclFunction(std::move(fn));
1688
1689   _execution_builder->append(std::move(acl_fn));
1690 }
1691
1692 void KernelGenerator::visit(const model::operation::Gather &node)
1693 {
1694   const auto ofm_index{node.getOutputs().at(0)};
1695
1696   const auto ifm_index{node.getInputs().at(model::operation::Gather::Input::INPUT)};
1697   const auto indices_index{node.getInputs().at(model::operation::Gather::Input::INDICES)};
1698
1699   const auto ifm_shape = _ctx.at(ifm_index).shape();
1700
1701   const auto axis_value = node.param().axis;
1702   const int axis =
1703       ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value();
1704
1705   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1706   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1707   auto indices_alloc = _tensor_builder->at(indices_index).get();
1708
1709   // NOTE The frontend layout and backend layout must be the same for this operation.
1710   //      If they differ, an extra stage would be needed to permute the output tensor, which is
1711   //      inefficient even if it works; it would be better to keep these backend tensors in the
1712   //      same layout.
1713   //      Another point to consider: this operation depends on the layout of the model. For
1714   //      example, if an NHWC model has this operation with output rank == 4, indices rank == 2
1715   //      and axis == 2, this operation should work on the axes W and C, but W and C are not
1716   //      contiguous in NCHW, so an NCHW backend cannot handle this case.
1717   const auto backend_layout = ofm_alloc->layout();
1718   UNUSED_RELEASE(backend_layout);
1719   assert(backend_layout == ifm_alloc->layout());
1720   assert(backend_layout == indices_alloc->layout());
1721   assert(ifm_shape.rank() < 4 || _current_subg_layout == backend_layout);
1722
1723   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLGatherEx>();
1724
1725   fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
1726
1727   auto acl_fn = asAclFunction(std::move(fn));
1728
1729   _execution_builder->append(std::move(acl_fn));
1730 }
1731
1732 void KernelGenerator::visit(const model::operation::Neg &node)
1733 {
1734   const auto ofm_index{node.getOutputs().at(0)};
1735   const auto ifm_index{node.getInputs().at(model::operation::Neg::Input::INPUT)};
1736
1737   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1738   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1739
1740   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>();
1741
1742   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1743
1744   auto acl_fn = asAclFunction(std::move(fn));
1745
1746   _execution_builder->append(std::move(acl_fn));
1747 }
1748
1749 void KernelGenerator::visit(const model::operation::Abs &node)
1750 {
1751   const auto output_index{node.getOutputs().at(0)};
1752   const auto input_index{node.getInputs().at(model::operation::Abs::Input::INPUT)};
1753
1754   auto output_alloc = _tensor_builder->at(output_index).get();
1755   auto input_alloc = _tensor_builder->at(input_index).get();
1756
1757   const ::arm_compute::ActivationLayerInfo act_info{
1758       ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
1759
1760   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1761
1762   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1763
1764   auto acl_fn = asAclFunction(std::move(fn));
1765
1766   _execution_builder->append(std::move(acl_fn));
1767 }
1768
1769 void KernelGenerator::visit(const model::operation::ArgMax &node)
1770 {
1771   const auto ofm_index{node.getOutputs().at(0)};
1772   const auto ifm_index{node.getInputs().at(model::operation::ArgMax::Input::INPUT)};
1773
1774   auto ifm_shape = _ctx.at(ifm_index).shape();
1775   auto ofm_shape = _ctx.at(ofm_index).shape();
1776
1777   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1778
1779   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1780   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1781   const auto ifm_rank = ifm_shape.rank();
1782   auto frontend_layout = _current_subg_layout;
1783   auto backend_layout = ifm_alloc->layout();
1784
1785   int axis_value = node.param().axis;
1786   if (axis_value < 0)
1787   {
1788     axis_value += ifm_rank;
1789   }
1790
1791   auto acl_axis =
1792       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1793
1794   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>();
1795
1796   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
1797                 ::arm_compute::ArgOperation::MAX);
1798
1799   auto acl_fn = asAclFunction(std::move(fn));
1800
1801   _execution_builder->append(std::move(acl_fn));
1802 }
1803
1804 void KernelGenerator::visit(const model::operation::Dequantize &node)
1805 {
1806   const auto output_index{node.getOutputs().at(0)};
1807   const auto input_index{node.getInputs().at(model::operation::Dequantize::Input::INPUT)};
1808
1809   auto output_alloc = _tensor_builder->at(output_index).get();
1810   auto input_alloc = _tensor_builder->at(input_index).get();
1811
1812   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
1813
1814   fn->configure(input_alloc->handle(), output_alloc->handle());
1815
1816   auto acl_fn = asAclFunction(std::move(fn));
1817
1818   _execution_builder->append(std::move(acl_fn));
1819 }
1820
1821 void KernelGenerator::visit(const model::operation::Mean &node)
1822 {
1823   const auto ofm_index{node.getOutputs().at(0)};
1824   const auto ifm_index{node.getInputs().at(model::operation::Mean::Input::INPUT)};
1825   const auto &axes{node.param().axes};
1826   const auto keep_dims{node.param().keep_dims};
1827
1828   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1829   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1830   const auto frontend_layout = _current_subg_layout;
1831   const auto backend_layout = ifm_alloc->layout();
1832
1833   // Convert to ACL axes taking into account negative values and possible duplicates.
1834   std::set<std::uint32_t> acl_axes;
1835   const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1836   for (int axis : axes)
1837   {
1838     if (axis < 0)
1839       axis += ifm_rank;
1840     acl_axes.insert(
1841         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1842   }
1843
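  // CLReduceMean expects the reduction axes as a Coordinates object, so flatten the deduplicated
  // axis set into one.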
1844   arm_compute::Coordinates reduce_axes;
1845   for (const auto axis : acl_axes)
1846   {
1847     reduce_axes.set(reduce_axes.num_dimensions(), axis);
1848   }
1849
1850   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceMean>();
1851
1852   fn->configure(ifm_alloc->handle(), reduce_axes, keep_dims, ofm_alloc->handle());
1853
1854   auto acl_fn = asAclFunction(std::move(fn));
1855
1856   _execution_builder->append(std::move(acl_fn));
1857 }
1858
1859 void KernelGenerator::visit(const model::operation::LocalResponseNormalization &node)
1860 {
1861   const auto ofm_index{node.getOutputs().at(0)};
1862   const auto ifm_index{
1863       node.getInputs().at(model::operation::LocalResponseNormalization::Input::INPUT)};
1864
1865   auto radius = node.param().radius;
1866   auto alpha = node.param().alpha;
1867   auto beta = node.param().beta;
1868   auto bias = node.param().bias;
1869
1870   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1871   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1872
1873   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1874       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
1875
1876   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
1877
1878   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
1879
1880   auto acl_fn = asAclFunction(std::move(fn));
1881
1882   _execution_builder->append(std::move(acl_fn));
1883 }
1884
1885 void KernelGenerator::visit(const model::operation::DepthToSpace &node)
1886 {
1887   const auto output_index{node.getOutputs().at(0)};
1888   const auto input_index{node.getInputs().at(model::operation::DepthToSpace::Input::INPUT)};
1889
1890   auto block_size = node.param().block_size;
1891   assert(block_size > 0);
1892
1893   auto output_alloc = _tensor_builder->at(output_index).get();
1894   auto input_alloc = _tensor_builder->at(input_index).get();
1895
1896   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>();
1897
1898   fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
1899
1900   auto acl_fn = asAclFunction(std::move(fn));
1901
1902   _execution_builder->append(std::move(acl_fn));
1903 }
1904
1905 void KernelGenerator::visit(const model::operation::ReduceMin &node)
1906 {
1907   const auto ofm_index{node.getOutputs().at(0)};
1908   const auto ifm_index{node.getInputs().at(model::operation::ReduceMin::Input::INPUT)};
1909   const auto &axes{node.param().axes};
1910
1911   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1912   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1913   const auto frontend_layout = _current_subg_layout;
1914   const auto backend_layout = ifm_alloc->layout();
1915
1916   // Convert to ACL axes taking into account negative values and possible duplicates.
1917   std::set<std::uint32_t> acl_axes;
1918   const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1919   for (int axis : axes)
1920   {
1921     if (axis < 0)
1922       axis += ifm_rank;
1923     acl_axes.insert(
1924         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1925   }
1926
1927   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
1928
1929   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
1930                 ::arm_compute::ReduceOperation::MIN);
1931
1932   auto acl_fn = asAclFunction(std::move(fn));
1933
1934   _execution_builder->append(std::move(acl_fn));
1935 }
1936
1937 void KernelGenerator::visit(const model::operation::Split &node)
1938 {
1939   const auto ifm_index{node.getInputs().at(model::operation::Split::Input::INPUT)};
1940
1941   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1942
1943   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1944   std::vector<model::OperandIndex> output_indexes;
1945   for (const auto &output : node.getOutputs())
1946     output_indexes.emplace_back(output);
1947
1948   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1949   std::vector<arm_compute::ICLTensor *> output_allocs;
1950   for (const auto &ofm_ind : output_indexes)
1951     output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
1952
1953   const auto frontend_layout = _current_subg_layout;
1954   const auto backend_layout = ifm_alloc->layout();
1955   auto axis = node.param().axis;
1956   if (axis < 0)
1957     axis += ifm_rank;
1958   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1959
1960   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>();
1961
1962   fn->configure(ifm_alloc->handle(), output_allocs, axis);
1963
1964   _execution_builder->append(asAclFunction(std::move(fn)));
1965 }
1966
1967 void KernelGenerator::visit(const model::operation::Unpack &node)
1968 {
1969   const auto input_index{node.getInputs().at(model::operation::Unpack::Input::INPUT)};
1970   auto axis{node.param().axis};
1971
1972   const auto input_rank = _ctx.at(input_index).shape().rank();
1973
1974   std::vector<model::OperandIndex> output_indexes;
1975   for (const auto &output_index : node.getOutputs())
1976     output_indexes.emplace_back(output_index);
1977
1978   auto input = _tensor_builder->at(input_index).get()->handle();
1979   std::vector<arm_compute::ICLTensor *> outputs;
1980   for (const auto &output_index : output_indexes)
1981     outputs.emplace_back(_tensor_builder->at(output_index)->handle());
1982
1983   const auto frontend_layout = _current_subg_layout;
1984   const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
1985   if (axis < 0)
1986     axis += input_rank;
1987   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1988
1989   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>();
1990
1991   fn->configure(input, outputs, axis);
1992
1993   _execution_builder->append(asAclFunction(std::move(fn)));
1994 }
1995
1996 void KernelGenerator::visit(const model::operation::Pad &node)
1997 {
1998   const auto input_index{node.getInputs().at(model::operation::Pad::Input::INPUT)};
1999   const auto pad_index{node.getInputs().at(model::operation::Pad::Input::PAD)};
2000   const auto output_index{node.getOutputs().at(0)};
2001   assert(_ctx.at(pad_index).isConstant());
2002
2003   auto rank = _ctx.at(pad_index).shape().dim(0);
2004   auto pad_base = _ctx.at(pad_index).data().base();
2005
2006   auto input_type = _ctx.at(input_index).typeInfo();
2007   auto data_type = acl_common::asDataType(input_type.type());
2008   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
2009   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
2010
2011   auto input = _tensor_builder->at(input_index).get()->handle();
2012   auto output = _tensor_builder->at(output_index).get()->handle();
2013
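  // The pad operand has shape [rank, 2]; row n holds the (before, after) padding amounts for
  // frontend axis n, which is remapped onto the corresponding backend/ACL axis below.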
2014   ::arm_compute::PaddingList padding_list;
2015   padding_list.resize(rank);
2016   for (int32_t n = 0; n < rank; ++n)
2017   {
2018     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
2019
2020     const auto frontend_layout = _current_subg_layout;
2021     const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
2022     const auto axis =
2023         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
2024     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
2025   }
2026   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>();
2027   fn->configure(input, output, padding_list, pixel_value);
2028
2029   _execution_builder->append(asAclFunction(std::move(fn)));
2030 }
2031
2032 } // namespace acl_cl
2033 } // namespace backend
2034 } // namespace neurun