/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "KernelGenerator.h"

#include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>
#include <Convert.h> // acl_common::asPadStrideInfo, asActivationLayerInfo, asTensorShape, ...
#include <Swizzle.h> // acl_common::ToARMComputeAxis, ReorderBits, getARMComputePermutationVector

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"

namespace onert
{
namespace backend
{
namespace acl_cl
{

using ::onert::backend::acl_common::asAclClFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
    ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclClFunction>;

KernelGenerator::KernelGenerator(const ir::Operands &operands_ctx,
                                 const ir::Operations &operations_ctx,
                                 const std::shared_ptr<TensorBuilder> &tensor_builder)
    : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
      _current_op_seq_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}

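// Each visit() below lowers a single IR operation to one or more arm_compute CL functions.
// visit(ir::OpSequence) drives the traversal: it dispatches to the per-operation overloads and
// collects the generated functions, in execution order, into _return_fn_seq.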
void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_op_seq_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}

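// Note that CLBatchToSpaceLayer takes the block size as a tensor handle; the assert below
// guarantees the operand is a compile-time constant, so the handle has data to read.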
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto block_size_alloc = _tensor_builder->at(block_size_index).get();

  assert(_ctx.at(block_size_index).data());

  auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>();

  fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

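// A BOOL8 input is tagged with SubDataType::BOOL so CLCast can treat it as boolean rather than
// plain 8-bit data; all other input types pass SubDataType::NONE.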
void KernelGenerator::visit(const ir::operation::Cast &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
                                  ? arm_compute::SubDataType::BOOL
                                  : arm_compute::SubDataType::NONE;

  auto fn = std::make_unique<::arm_compute::CLCast>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

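// Explicit padding is computed from the frontend shapes and padding parameters; the fused
// activation is passed to CLConvolutionLayer directly via act_info, so no separate activation
// function needs to be appended here.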
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
                conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclClFunction(std::move(fn));
}

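// Same lowering as Conv2D, plus the depth multiplier: each input channel produces `multiplier`
// output channels in CLDepthwiseConvolutionLayer.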
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
                ofm_alloc->handle(), conv_info, multiplier, act_info);

  _return_fn = asAclClFunction(std::move(fn));
}

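// CLPoolingLayer takes no activation info, so the pooling visitors append the fused activation
// as a separate CL activation function via ActivationBuilder (see the FunctionSequence at the
// end of each).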
void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
                                       ::arm_compute::Size2D{kw, kh},
                                       acl_common::asPadStrideInfo(padding, stride)};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
      acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

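// Concat elimination: when every input is already a sub-tensor of the output (arranged by the
// tensor builder), there is nothing left to execute and a NopFunction is returned instead.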
void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;

  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat is eliminated, return a NOP IFunction
    VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_alloc = _tensor_builder->at(ofm_index).get();
  std::vector<::arm_compute::ICLTensor *> input_tensors;
  for (auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    auto l = std::make_unique<::arm_compute::CLCopy>();
    l->configure(input_tensors.at(0), output_alloc->handle());
    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLConcatenateLayer>();
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = output_alloc->layout();
    const auto fixed_axis =
        acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    l->configure(input_tensors, output_alloc->handle(), fixed_axis);
    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

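// For rank-3/4 inputs the tensor is flattened to rank-2 [batch_size, input_size] before the
// GEMM; e.g. a [1, 4, 4, 2] input against a weight whose last dimension is 32 collapses to
// [1, 32], since feature_size (1*4*4*2) must equal batch_size * input_size.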
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
  const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  const auto output_size =
      _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
  UNUSED_RELEASE(output_size);
  assert(_ctx.at(bias_index).shape().dim(0) == output_size);
  assert(_ctx.at(weight_index).shape().dim(0) == output_size);
  const auto batch_size =
      _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
  const auto input_size =
      _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);

  // Check whether the input's shape needs reshaping into rank-2
  bool needs_reshape = false;
  ir::Shape reshape(2);
  if (input_rank == 3 || input_rank == 4)
  {
    const auto &ifm_shape = _ctx.at(input_index).shape();
    auto feature_size = 1;
    for (int i = 0; i < ifm_shape.rank(); ++i)
    {
      feature_size *= ifm_shape.dim(i);
    }

    UNUSED_RELEASE(feature_size);
    assert(feature_size == batch_size * input_size);

    // for reshaping
    needs_reshape = true;
    reshape.dim(0) = batch_size; /* H */
    reshape.dim(1) = input_size; /* W */
  }

  const auto activation = node.param().activation;

  auto output_alloc = _tensor_builder->at(output_index).get();
  const auto input_alloc = _tensor_builder->at(input_index).get();
  const auto weight_alloc = _tensor_builder->at(weight_index).get();
  const auto bias_alloc = _tensor_builder->at(bias_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto acl_layout = output_alloc->handle()->info()->data_layout();

  auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type =
      arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL;
  if (_ctx.at(weight_index).isConstant())
  {
    kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
    assert(_ctx.at(weight_index).data());
  }
  fn->configure(
      input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
      needs_reshape,
      ::onert::backend::acl_common::asTensorShape(
          reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
      kernel_type);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)),
      ActivationBuilder::generate(activation, output_alloc->handle()));
}

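// Mul maps to CLPixelWiseMultiplication with scale 1.0, saturating overflow handling, and
// round-to-nearest-even; the fused activation is appended afterwards as usual.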
void KernelGenerator::visit(const ir::operation::Mul &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
                arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

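// MEAN gets a dedicated CLReduceMean; all other reduce types share CLReduceOperation, which is
// constructed with the backend's internal buffer manager.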
void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
  const auto keep_dims{node.param().keep_dims};
  const auto reduce_type = node.param().reduce_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_alloc->layout();

  std::unique_ptr<arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    auto l = std::make_unique<::arm_compute::CLReduceMean>();

    const auto acl_axes =
        acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
    l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle());

    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLReduceOperation>(
        _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

    const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
    l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims,
                 acl_common::convertReduceType(reduce_type));

    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_alloc->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to Reshape except for the optional dimensions input.
  // The optional dims are ignored here because the output tensor already carries the squeezed
  // shape produced by the converter (e.g. the TensorFlow freezer and toco).
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  UNUSED_RELEASE(dims);
  UNUSED_RELEASE(ndim);

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
  auto fn = std::make_unique<arm_compute::CLReshapeLayer>();
  fn->configure(input_alloc->handle(), output_alloc->handle());
  auto acl_fn = asAclClFunction(std::move(fn));
  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Tanh &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(input_alloc->handle(), output_alloc->handle(), beta);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

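// The IR Slice operands are (begin, size) per axis, while CLSlice expects (start, end)
// coordinates, hence ends[axis] = begin + size below; axes are remapped to the backend layout
// through ToARMComputeAxis.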
void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Initialize the per-axis index data in the backend's axis order for inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  {
    assert(_ctx.at(begins_index).data());
    assert(_ctx.at(sizes_index).data());
    auto beginData_base = _ctx.at(begins_index).data()->base();
    auto sizeData_base = _ctx.at(sizes_index).data()->base();
    const int beginData_size = _ctx.at(begins_index).shape().num_elements();
    const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(beginData_size);
    UNUSED_RELEASE(sizeData_size);

    assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
    assert(beginData_size == input_rank);
    assert(sizeData_size == input_rank);

    assert(beginData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                      .value();

      int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
      starts[axis] = begin_value;

      int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
      ends[axis] = begin_value + size_value;
    }
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = std::make_unique<::arm_compute::CLSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

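// STARTS/ENDS/STRIDES must be constant operands (asserted below). The begin/end/shrink-axis
// mask bits are reordered with ReorderBits so they follow the backend axis order, matching the
// remapped coordinates.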
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Initialize the per-axis index data in the backend's axis order for inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    assert(_ctx.at(starts_index).data());
    assert(_ctx.at(ends_index).data());
    assert(_ctx.at(strides_index).data());
    auto startData_base = _ctx.at(starts_index).data()->base();
    auto endData_base = _ctx.at(ends_index).data()->base();
    auto stridesData_base = _ctx.at(strides_index).data()->base();
    const int startData_size = _ctx.at(starts_index).shape().num_elements();
    const int endData_size = _ctx.at(ends_index).shape().num_elements();
    const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                      .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Reorder the mask bits to match the backend axis order of inputData
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
                                                           frontend_layout, backend_layout);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
                                                         frontend_layout, backend_layout);
  const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
      node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = std::make_unique<::arm_compute::CLStridedSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
                strides_set, begin_mask, end_mask, shrink_axis_mask);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

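// The frontend permutation is rewritten for the backend layout by
// getARMComputePermutationVector before being given to CLPermute.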
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto &perm{node.param().perm};

  const auto rank = _ctx.at(ifm_idx).shape().rank();

  auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
  auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_alloc->layout();

  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());

  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
      rank, pv, frontend_layout, backend_layout);

  auto fn = std::make_unique<::arm_compute::CLPermute>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Add &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
                arm_compute::ConvertPolicy::SATURATE);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Sub &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
                arm_compute::ConvertPolicy::SATURATE);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Div &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Exp &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLExpLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto gamma_alloc = _tensor_builder->at(gamma_index).get();
  auto beta_alloc = _tensor_builder->at(beta_index).get();
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
                beta_alloc->handle(), epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Logistic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                ::arm_compute::BinaryLogicalOperation::AND);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

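// CIFG, peephole, and projection are all detected structurally: the optional weight operands
// exist but have zero-sized shapes when the variant is disabled. The flags computed below
// decide which optional LSTMParams get populated before configuring CLLSTMLayer.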
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  // TODO Support dynamic rnn
  // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
  const auto scratch_buffer_index{
      node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
  const auto output_state_out_index{
      node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
  const auto cell_state_out_index{
      node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
  const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};

  const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
  const auto input_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
  const auto input_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
  const auto input_to_cell_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
  const auto input_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
  const auto recurrent_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
  const auto recurrent_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
  const auto recurrent_to_cell_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
  const auto recurrent_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
  const auto cell_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
  const auto cell_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
  const auto cell_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
  const auto input_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
  const auto forget_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
  const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
  const auto output_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
  const auto projection_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
  const auto projection_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
  const auto output_state_in_index{
      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
  const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
  const auto cell_threshold = node.param().cell_threshold;
  const auto projection_threshold = node.param().projection_threshold;

  bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
                                    _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
  bool has_recurrent_to_input_weights =
      _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
      _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
  bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
  bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
  bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
                                _ctx.at(projection_weights_index).shape().dim(1) != 0;
  bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0) != 0;

  // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
  // true: no CIFG
  // false: CIFG
  // NOTE cell_to_input_weights is absent without peephole connections, even in a regular
  //      (non-CIFG) LSTM.
  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;

  // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole,
  //      but cell_to_input_weights is absent in CIFG LSTMs even with peephole connections.
  // true: peephole
  // false: no peephole
  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;

  // NOTE Even when the projection weights have data, the projection bias may not.
  bool has_projection_param = has_projection_weights;

  const auto activation = node.param().activation;
  const auto cell_clip = cell_threshold;
  const auto projection_clip = projection_threshold;
  assert(cell_clip >= 0.f && projection_clip >= 0.f);

  auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
  auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
  auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
  auto output_alloc = _tensor_builder->at(output_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();

  auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
  auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
  auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
  auto recurrent_to_forget_weights_alloc =
      _tensor_builder->at(recurrent_to_forget_weights_index).get();
  auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
  auto recurrent_to_output_weights_alloc =
      _tensor_builder->at(recurrent_to_output_weights_index).get();

  auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
  auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
  auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
  auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
  auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();

  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLLSTMLayer>();

  ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
  if (has_cifg_param)
  {
    auto input_to_input_weights_alloc =
        _tensor_builder->at(input_to_input_weights_index).get(); // optional
    auto recurrent_to_input_weights_alloc =
        _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
    auto cell_to_input_weights_handle =
        has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
                           : nullptr; // optional (non-cifg && peephole)
    auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
    lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
                                recurrent_to_input_weights_alloc->handle(),
                                cell_to_input_weights_handle, input_gate_bias_alloc->handle());
  }
  if (has_peephole_param)
  {
    auto cell_to_forget_weights_alloc =
        _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
    auto cell_to_output_weights_alloc =
        _tensor_builder->at(cell_to_output_weights_index).get(); // optional
    lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
                                    cell_to_output_weights_alloc->handle());
  }
  if (has_projection_param)
  {
    auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
    auto projection_bias_handle = has_projection_bias
                                      ? _tensor_builder->at(projection_bias_index).get()->handle()
                                      : nullptr; // optional
    lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
  }

  fn->configure(
      input_alloc->handle(), input_to_forget_weights_alloc->handle(),
      input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
      recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
      recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
      cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
      cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
      output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
      lstm_params, act_info, cell_clip, projection_clip);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = std::make_unique<::arm_compute::CLComparison>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                static_cast<arm_compute::ComparisonOperation>(comparison_type));

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

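// A negative pack axis counts from the end, so it is normalized by adding the output rank
// before the frontend->backend axis conversion.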
void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_builder->at(output_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_builder->at(input_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(output_index).get()->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLStackLayer>();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes;
  for (const auto &input_index : input_indexes)
  {
    size_t input_rank = _ctx.at(input_index).shape().rank();
    const auto &input_alloc = _tensor_builder->at(input_index);
    orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape());
    assert(input_rank == input_alloc->num_dimensions());
    if (input_rank != input_alloc->info()->num_dimensions())
    {
      // The leading (highest) dimensions are 1, so ACL applied dim_correction to this input
      input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  fn->configure(inputs, axis, output);

  // Revert disabling applied dim_correction
  assert(inputs.size() == orig_inputs_acl_tensor_shapes.size());
  for (size_t i = 0; i < inputs.size(); ++i)
  {
    inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i));
  }

  _return_fn = asAclClFunction(std::move(fn));
}

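// For rank-4 layout conversions, CLPermute is configured with a fixed permutation vector
// (axis indices in ACL's backend order); any other permute request degenerates to a plain copy.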
void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
  auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    auto l = std::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);

    fn = std::move(l);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    auto l = std::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);

    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLCopy>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle());

    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::RSQRT &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  _return_fn = asAclClFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ReLU &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLScale>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
                ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
                ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ReLU1 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ReLU6 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

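// The previous hidden state is first copied into HIDDEN_STATE_OUT with CLCopy, because
// CLRNNLayerEx reads and writes the recurrent state through that output tensor; both functions
// are sequenced so the copy always runs first.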
void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();
  auto weights_alloc = _tensor_builder->at(weights_index).get();
  auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();
  auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = std::make_unique<::arm_compute::CLCopy>();
  copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());

  auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
  fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
                bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
                act_info);

  // Sequence both functions; assigning each to _return_fn in turn would drop the state copy.
  _return_fn = std::make_unique<exec::FunctionSequence>(asAclClFunction(std::move(copy_layer)),
                                                        asAclClFunction(std::move(fn)));
}

void KernelGenerator::visit(const ir::operation::Floor &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLFloor>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto block_size_alloc = _tensor_builder->at(block_size_index).get();
  auto paddings_alloc = _tensor_builder->at(paddings_index).get();

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  std::unique_ptr<::arm_compute::IFunction> fn;

  auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>();
  l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
               ofm_alloc->handle());
  fn = std::move(l);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);

  uint32_t kw = node.param().kw;
  uint32_t kh = node.param().kh;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
      ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto lookups_alloc = _tensor_builder->at(lookups_index).get();
  auto values_alloc = _tensor_builder->at(values_index).get();

  auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>();

  fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0, while
  // L2 Normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations
  // by choosing the normalization parameters as below.

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
      2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                            // In the implementation to make alpha_ become 1
  float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                             // Don't offset the reduction.
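  // With these parameters, CROSS_MAP normalization (is_scaled == false below, so alpha is not
  // divided by the window size) computes out = x / (bias + alpha * sum(x_k^2))^beta
  // = x / sqrt(sum of x_k^2 over all channels), i.e. an L2 normalization along depth; the
  // radius above guarantees the window always covers the whole depth.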
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto hits_alloc = _tensor_builder->at(hits_index).get();

  auto lookups_alloc = _tensor_builder->at(lookups_index).get();
  auto keys_alloc = _tensor_builder->at(keys_index).get();
  auto values_alloc = _tensor_builder->at(values_index).get();

  auto fn = std::make_unique<::arm_compute::CLHashtableLookup>();

  fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
                output_alloc->handle(), hits_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto alpha_alloc = _tensor_builder->at(alpha_index).get();

  auto fn = std::make_unique<::arm_compute::CLPReLU>();

  fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);

  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
        ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }
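  // With VALID padding, a transposed convolution covers (ifm - 1) * stride + ker output
  // pixels per axis, so the "invalid" margin computed above is
  //   ofm - ((ifm - 1) * stride + ker)
  // i.e. the right/bottom part of the requested output that no input pixel maps to; it is
  // passed on to CLTransposeConvLayer so those border pixels can be accounted for.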
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
                invalid_horizontal, invalid_vertical);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::SQRT &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LogicalOr &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();
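  // NOTE BOOL8 operands are backed by 8-bit tensors on this backend, so a bitwise OR over the
  // raw bytes implements the logical OR: 0x00 | 0x00 == 0x00, and any non-zero byte stays
  // non-zero.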
  auto fn = std::make_unique<::arm_compute::CLBitwiseOr>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LogicalNot &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
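  // NOTE Implementing logical NOT with CLBitwiseNot assumes BOOL8 "true" is materialized as
  // 0xFF: ~0x00 == 0xFF and ~0xFF == 0x00. A 0x00/0x01 encoding would not survive a bitwise
  // NOT.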
  auto fn = std::make_unique<::arm_compute::CLBitwiseNot>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::TopKV2 &node)
{
  const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
  const auto outputIndices_index{
      node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};

  const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};

  // Currently we only support 1D input (or a 2D batch of vectors)
  assert(_ctx.at(inputData_index).shape().rank() == 1 ||
         _ctx.at(inputData_index).shape().rank() == 2);

  const auto k = node.param().k;

  auto values_alloc = _tensor_builder->at(outputValues_index).get();
  auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
  auto input_alloc = _tensor_builder->at(inputData_index).get();

  auto fn = std::make_unique<::arm_compute::CLTopKV2>();

  fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
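  // NOTE ToARMComputeAxis maps a frontend axis to ACL's dimension numbering, which is reversed
  // relative to the frontend (ACL dimension 0 is the innermost, fastest-varying one).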
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto indices_alloc = _tensor_builder->at(indices_index).get();

  // NOTE The frontend layout and backend layout must be the same for this operation.
  // If they differ, we would have to add a stage that permutes the output tensor, which
  // would be inefficient even if it worked. In that case it would be better to give these
  // backend tensors the same layout in the first place.
  // There is one more thing to consider: this operation depends on the layout of the model.
  // For example, if an NHWC model has this operation with output rank == 4, indices rank == 2
  // and axis == 2, the operation should act on the W and C axes; but W and C are not adjacent
  // in NCHW, so a backend in NCHW cannot handle this case.
  const auto backend_layout = ofm_alloc->layout();
  UNUSED_RELEASE(backend_layout);
  assert(backend_layout == ifm_alloc->layout());
  assert(backend_layout == indices_alloc->layout());
  assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
  auto fn = std::make_unique<::arm_compute::CLGatherEx>();

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_alloc->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_alloc->num_dimensions());
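  // NOTE "dim_correction" refers to ACL trimming trailing size-1 dimensions from a tensor's
  // reported number of dimensions. CLGatherEx is rank-sensitive, so the full-rank shapes are
  // temporarily restored below and reverted after configure().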
  // Disable applied dim_correction
  const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape();
  if (n != ifm_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // ifm tensor
    const auto ifm = _ctx.at(ifm_index);
    ifm_alloc->info()->set_tensor_shape(
        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
  }
  const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape();
  if (k != indices_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // indices tensor
    const auto indices = _ctx.at(indices_index);
    indices_alloc->info()->set_tensor_shape(
        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
  }

  fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);

  // Revert disabling applied dim_correction
  ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
  indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Neg &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLNeg>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Abs &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ArgMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};

  auto ifm_shape = _ctx.at(ifm_index).shape();
  auto ofm_shape = _ctx.at(ofm_index).shape();

  assert((ifm_shape.rank() - 1) == ofm_shape.rank());

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  auto frontend_layout = _current_op_seq_layout;
  auto backend_layout = ifm_alloc->layout();

  int axis_value = node.param().axis;
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }

  auto acl_axis =
      acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLArgOperation>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
                ::arm_compute::ArgOperation::MAX);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Dequantize &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLCast>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
      node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
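  // NOTE NormalizationLayerInfo takes the full window size, while the frontend radius is a
  // half-window, hence radius * 2 + 1 below.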
  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
      ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthToSpace>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Split &node)
{
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  std::vector<arm_compute::ICLTensor *> output_allocs;
  for (const auto &ofm_ind : output_indexes)
    output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_alloc->layout();
  auto axis = node.param().axis;
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLSplit>();

  fn->configure(ifm_alloc->handle(), output_allocs, axis);

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input = _tensor_builder->at(input_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_builder->at(output_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
  for (const auto &output_index : output_indexes)
  {
    size_t output_rank = _ctx.at(output_index).shape().rank();
    const auto &output_alloc = _tensor_builder->at(output_index);
    orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
    assert(output_rank == output_alloc->num_dimensions());
    if (output_rank != output_alloc->info()->num_dimensions())
    {
      // This means a high dimension's value is 1 and dim_correction has been applied to the
      // output tensor
      output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = std::make_unique<::arm_compute::CLUnstack>();

  fn->configure(input, outputs, axis);

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input_type = _ctx.at(input_index).typeInfo();
  auto data_type = acl_common::asDataType(input_type.type());
  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
  const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);

  auto input = _tensor_builder->at(input_index).get()->handle();
  auto output = _tensor_builder->at(output_index).get()->handle();

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(input_index).get()->layout();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
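  // The PAD input is a [rank, 2] int32 tensor: row n holds (padding_before, padding_after)
  // for the n-th frontend dimension, which is remapped to the matching ACL axis below.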
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
    const auto axis =
        acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  auto fn = std::make_unique<::arm_compute::CLPadLayer>();

  // Disable applied dim_correction
  size_t input_rank = _ctx.at(input_index).shape().rank();
  const auto &input_alloc = _tensor_builder->at(input_index);
  assert(input_rank == input_alloc->num_dimensions());
  if (input_rank != input_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // input tensor
    input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
  }

  fn->configure(input, output, padding_list, pixel_value);

  // Do not revert the disabled dim_correction: CLPadKernel only has a CL kernel for the
  // 4-dimensional case, and reverting would produce a mismatch in the result

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Min &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseMin>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Max &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseMax>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
                0);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
                0);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
} // namespace acl_cl
} // namespace backend
} // namespace onert