Imported Upstream version 1.9.0
platform/core/ml/nnfw.git: runtime/onert/backend/acl_neon/KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
20 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21 #include <arm_compute/runtime/CPP/functions/CPPOneHotEx.h>
22
23 #include <AclActivationBuilder.h>
24 #include <AclFunction.h>
25 #include <Convert.h>
26 #include <Swizzle.h>
27
28 #include "ir/Index.h"
29 #include "ir/DataType.h"
30 #include "ir/InternalType.h"
31 #include "exec/NopFunction.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_neon
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45     ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
49     const std::shared_ptr<TensorBuilder> &tensor_builder,
50     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
51     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
52       _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
53 {
54   // DO NOTHING
55 }
56
57 void KernelGenerator::visit(const ir::OpSequence &op_seq)
58 {
59   // TODO Move this to IKernelGenerator
60   //      (all derivatives have the same implementation for this)
61   assert(!_return_fn_seq);
62   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
63   _return_fn_seq->enableDynamicShapeInferer(false);
64
65   _current_op_seq_layout = op_seq.getLayout();
66   for (const auto &operation_idx : op_seq.operations())
67   {
68     const auto &node = _operations_ctx.at(operation_idx);
69     node.accept(*this);
70     _return_fn_seq->append(releaseFunction());
71   }
72 }
73
74 void KernelGenerator::visit(const ir::operation::ArgMax &node)
75 {
76   const auto ofm_index{node.getOutputs().at(0)};
77   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
78
79   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
80
81   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
82   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
83   auto frontend_layout = _current_op_seq_layout;
84   auto backend_layout = ifm_tensor->layout();
85
86   int axis_value = node.param().axis;
87   if (axis_value < 0)
88   {
89     axis_value += ifm_rank;
90   }
91   assert(axis_value >= 0 && axis_value < ifm_rank);
92   const auto fixed_axis =
93       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
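  // Note on the axis conversion (my reading of acl_common::ToARMComputeAxis, not stated in this
  // file): ACL indexes dimensions starting from the fastest-varying one, so the frontend axis is
  // mapped in reverse order; e.g. for a rank-4 NHWC input with an NHWC backend tensor, frontend
  // axis 3 (channels) would map to ACL axis 0.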
94
95   auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
96       ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
97       arm_compute::ReductionOperation::ARG_IDX_MAX);
98
99   _return_fn = asAclFunction(std::move(fn));
100 }
101
102 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
103 {
104   const auto ofm_index{node.getOutputs().at(0)};
105   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
106   const auto block_size_index{
107       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
108
109   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
110   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
111   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
112
113   assert(_ctx.at(block_size_index).data());
114
115   auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
116       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
117
118   _return_fn = asAclFunction(std::move(fn));
119 }
120
121 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
122 {
123   const auto ofm_index{node.getOutputs().at(0)};
124   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
125   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
126
127   const auto activation = node.param().activation;
128
129   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
130   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
131   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
132
133   std::unique_ptr<arm_compute::IFunction> fn;
134   switch (node.param().arithmetic_type)
135   {
136     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
137     {
138       fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
139           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
140           arm_compute::ConvertPolicy::SATURATE);
141       break;
142     }
143     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
144     {
145       fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
146           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
147           arm_compute::ConvertPolicy::SATURATE);
148       break;
149     }
150     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
151     {
152       // For scale 1.0, the only RoundingPolicy allowed is RoundingPolicy::TO_ZERO
153       fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
154           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
155           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
156       break;
157     }
158     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
159     {
160       fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
161           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
162       break;
163     }
164     default:
165       assert(false && "The BinaryArithmetic operation supports only ADD, SUB, MUL and DIV");
166       break;
167   }
168   _return_fn = std::make_unique<exec::FunctionSequence>(
169       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
170 }
171
172 void KernelGenerator::visit(const ir::operation::Conv2D &node)
173 {
174   using ir::operation::Conv2D;
175
176   const auto ofm_index{node.getOutputs().at(0)};
177   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
178   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
179   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
180
181   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
182   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
183   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
184   const auto &ker_shape = _ctx.at(ker_index).shape();
185   const auto ker_height = ker_shape.dim(1);
186   const auto ker_width = ker_shape.dim(2);
187
188   const auto stride = node.param().stride;
189   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
190                                             ker_width, ker_height);
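  // Rough sketch of the SAME-padding arithmetic (assumed behaviour of ir::calculatePadding, not
  // verified here): total_pad = max(0, (out - 1) * stride + kernel - in) per axis, split between
  // the two sides; e.g. in = 5, stride = 2, kernel = 3, out = 3 gives total_pad = 2, i.e. one
  // element of padding on each side.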
191   const auto activation = node.param().activation;
192
193   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
194   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
195   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
196   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
197
198   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
199   const auto act_info = acl_common::asActivationLayerInfo(activation);
200
201   auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
202       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
203       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
204       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
205
206   _return_fn = asAclFunction(std::move(fn));
207 }
208
209 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
210 {
211   const auto output_index{node.getOutputs().at(0)};
212   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
213
214   auto block_size = node.param().block_size;
215   assert(block_size > 0);
216
217   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
218   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
219
220   auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
221       input_tensor->handle(), output_tensor->handle(), block_size);
222
223   _return_fn = asAclFunction(std::move(fn));
224 }
225
226 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
227 {
228   using ir::operation::DepthwiseConv2D;
229
230   const auto ofm_index{node.getOutputs().at(0)};
231   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
232   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
233   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
234
235   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
236   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
237   // Kernel format is [1, kernel_height, kernel_width, depth_out].
238   const auto &ker_shape = _ctx.at(ker_index).shape();
239   const auto ker_height = ker_shape.dim(1);
240   const auto ker_width = ker_shape.dim(2);
241
242   const auto stride = node.param().stride;
243   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
244                                             ker_width, ker_height);
245   const auto multiplier = node.param().multiplier;
246   const auto activation = node.param().activation;
247
248   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
249   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
250   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
251   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
252
253   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
254   const auto act_info = acl_common::asActivationLayerInfo(activation);
255
256   {
257     auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
258         ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
259         conv_info, multiplier, act_info);
260
261     _return_fn = asAclFunction(std::move(fn));
262   }
263 }
264
265 void KernelGenerator::visit(const ir::operation::Concat &node)
266 {
267   const auto ofm_index{node.getOutputs().at(0)};
268
269   std::vector<ir::OperandIndex> input_indexes;
270   for (const auto &input : node.getInputs())
271     input_indexes.emplace_back(input);
272
273   const auto axis = node.param().axis;
274
275   // Concat elimination check
276   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
277   if (eliminated)
278   {
279     // If the concat is eliminated, return a NOP IFunction
280     VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
281     _return_fn = std::make_unique<exec::NopFunction>();
282     return;
283   }
284
285   auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
286   std::vector<::arm_compute::ITensor *> input_tensors;
287   for (const auto &ifm_ind : input_indexes)
288     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
289
290   std::unique_ptr<::arm_compute::IFunction> fn;
291   if (input_indexes.size() < 2)
292   {
293     fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
294                                                         output_tensor->handle());
295   }
296   else
297   {
298     const auto rank = _ctx.at(ofm_index).shape().rank();
299     const auto frontend_layout = _current_op_seq_layout;
300     const auto backend_layout = output_tensor->layout();
301     const auto fixed_axis =
302         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
303     fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
304         input_tensors, output_tensor->handle(), fixed_axis);
305   }
306
307   _return_fn = asAclFunction(std::move(fn));
308 }
309
310 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
311 {
312   const auto ofm_index{node.getOutputs().at(0)};
313   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
314
315   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
316   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
317
318   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
319       node.param().op_type, node.param().alpha, node.param().beta);
320
321   std::unique_ptr<arm_compute::IFunction> fn;
322   if (node.param().op_type == ir::operation::ElementwiseActivation::Type::LOGISTIC)
323   {
324     // NOTE NEActivationLayer can produce erroneous results; this was caused by 'vexpq_f32()'.
325     // The NEON function returns 'NaN' instead of 'INF' for values outside the representable
326     // float range, and the 'NaN' then corrupts the result of this op.
327     // NEActivationLayerEx is used here to avoid that issue.
328     fn = acl_common::generateLayer<arm_compute::NEActivationLayerEx>(
329         ifm_tensor->handle(), ofm_tensor->handle(), act_info);
330   }
331   else
332   {
333     fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
334                                                                    ofm_tensor->handle(), act_info);
335   }
336
337   _return_fn = asAclFunction(std::move(fn));
338 }
339
340 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
341 {
342   const auto output_index{node.getOutputs().at(0)};
343   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
344   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
345
346   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
347   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
348   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
349
350   std::unique_ptr<arm_compute::IFunction> fn;
351   switch (node.param().op_type)
352   {
353     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
354     {
355       fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
356           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
357       break;
358     }
359     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
360     {
361       fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
362           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
363       break;
364     }
365     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
366     {
367       fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
368           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
369       break;
370     }
371     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
372     {
373       fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
374           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
375       break;
376     }
377     default:
378     {
379       std::string err_msg("acl_neon KernelGenerator : " + node.name() +
380                           " is not an elementwise-binary operation");
381       assert(false && err_msg.c_str());
382       break;
383     }
384   }
385   _return_fn = asAclFunction(std::move(fn));
386 }
387
388 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
389 {
390   const auto output_index{node.getOutputs().at(0)};
391   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
392
393   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
394   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
395
396   std::unique_ptr<arm_compute::IFunction> fn;
397   switch (node.param().op_type)
398   {
399     case ir::operation::ElementwiseUnary::Type::ABS:
400     {
401       const ::arm_compute::ActivationLayerInfo act_info{
402           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
403
404       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
405           input_tensor->handle(), output_tensor->handle(), act_info);
406       break;
407     }
408     case ir::operation::ElementwiseUnary::Type::CAST:
409     {
410       if (input_tensor->data_type() == output_tensor->data_type())
411       {
412         fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
413                                                             output_tensor->handle());
414       }
415       else
416       {
417         fn = acl_common::generateLayer<arm_compute::NECast>(
418             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
419       }
420       break;
421     }
422     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
423     {
424       fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
425                                                                          output_tensor->handle());
426       break;
427     }
428     case ir::operation::ElementwiseUnary::Type::EXP:
429     {
430       fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
431                                                               output_tensor->handle());
432       break;
433     }
434     case ir::operation::ElementwiseUnary::Type::FLOOR:
435     {
436       fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
437                                                            output_tensor->handle());
438       break;
439     }
440     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
441     {
442       fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
443                                                                 output_tensor->handle());
444       break;
445     }
446     case ir::operation::ElementwiseUnary::Type::NEG:
447     {
448       fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
449                                                               output_tensor->handle());
450       break;
451     }
452     case ir::operation::ElementwiseUnary::Type::RSQRT:
453     {
454       fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
455                                                                 output_tensor->handle());
456       break;
457     }
458     case ir::operation::ElementwiseUnary::Type::SQRT:
459     {
460       const ::arm_compute::ActivationLayerInfo act_info{
461           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
462
463       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
464           input_tensor->handle(), output_tensor->handle(), act_info);
465       break;
466     }
467     default:
468     {
469       throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
470                                " is not supported yet");
471       break;
472     }
473   }
474   _return_fn = asAclFunction(std::move(fn));
475 }
476
477 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
478 {
479   const auto output_index{node.getOutputs().at(0)};
480   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
481   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
482
483   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
484   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
485   auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
486
487   auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
488       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
489
490   _return_fn = asAclFunction(std::move(fn));
491 }
492
493 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
494 {
495   const auto output_index{node.getOutputs().at(0)};
496   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
497   const auto activation = node.param().activation;
498
499   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
500                                                 ::arm_compute::NEFullyConnectedReshapingLayer>(
501       node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
502   _return_fn = std::make_unique<exec::FunctionSequence>(
503       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
504 }
505
506 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
507 {
508   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
509   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
510
511   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
512   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
513   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
514
515   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
516   auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();
517
518   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
519   auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
520   auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
521
522   auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
523       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
524       output_tensor->handle(), hits_tensor->handle());
525
526   _return_fn = asAclFunction(std::move(fn));
527 }
528
529 void KernelGenerator::visit(const ir::operation::Gather &node)
530 {
531   const auto ofm_index{node.getOutputs().at(0)};
532
533   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
534   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
535
536   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
537   const auto axis_raw = node.param().axis;
538   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
539   // Converting in reverse order
540   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
541
542   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
543   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
544   auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();
545   const auto backend_layout = ofm_tensor->layout();
546   UNUSED_RELEASE(backend_layout);
547
548   // NOTE The frontend layout and backend layout must be the same for this operation.
549   //      If they are not the same, we would have to add a stage(?) to permute the output tensor,
550   //      which is not efficient even if it works well. If so, it would be better to set the
551   //      layout of these backend tensors to the same layout.
552   //      There is also one thing we have to think about. This operation depends on the layout of
553   //      a model. For example, if a model in NHWC has this operation with output rank == 4,
554   //      indices rank == 2 and axis == 2, the gather result spans the W and C axes; but W and
555   //      C are not adjacent in NCHW, so a backend running in NCHW cannot handle this case.
556   assert(backend_layout == ifm_tensor->layout());
557   assert(backend_layout == indices_tensor->layout());
558   assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
559
560   // input is n-D, indices k-D, output is (n + k - 1)-D
561   size_t n = ifm_rank;
562   assert(n == ifm_tensor->num_dimensions());
563   size_t k = _ctx.at(indices_index).shape().rank();
564   assert(k == indices_tensor->num_dimensions());
565
566   // Disable applied dim_correction
567   if (n != ifm_tensor->info()->num_dimensions())
568   {
569     // This means that the highest dimensions have value 1 and dim_correction was applied to the ifm tensor
570     const auto ifm = _ctx.at(ifm_index);
571     ifm_tensor->info()->set_tensor_shape(
572         acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
573   }
574   if (k != indices_tensor->info()->num_dimensions())
575   {
576     // This means that the highest dimensions have value 1 and dim_correction was applied to the indices tensor
577     const auto indices = _ctx.at(indices_index);
578     indices_tensor->info()->set_tensor_shape(
579         acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
580   }
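  // Assumption about dim_correction: ACL collapses trailing dimensions of size 1, so a tensor can
  // report fewer dimensions than its frontend rank; passing 'false' to asTensorShape keeps the
  // full rank so that NEGatherEx sees the expected number of dimensions.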
581
582   auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
583       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
584
585   // acl_neon does not revert the disabled dim_correction here because acl_neon's kernels may
586   // use arm_compute::TensorInfo::offset_element_in_bytes(), which would cause an error when a
587   // kernel accesses a high dimension whose value is 1.
588
589   _return_fn = asAclFunction(std::move(fn));
590 }
591
592 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
593 {
594   const auto ofm_index{node.getOutputs().at(0)};
595   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
596   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
597   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
598
599   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
600   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
601   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
602   auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
603   auto epsilon = node.param().epsilon;
604   auto activation = node.param().activation;
605
606   auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
607       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
608       epsilon);
609
610   _return_fn = std::make_unique<exec::FunctionSequence>(
611       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
612 }
613
614 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
615 {
616   const auto ofm_index{node.getOutputs().at(0)};
617   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
618
619   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
620   // L2 Normalization always performs the reduction along the depth axis
621   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
622   // choosing normalization parameters as below
623
624   const auto &ifm_shape = _ctx.at(ifm_index).shape();
625   // TODO Support an optional constant dimension along which the normalization is performed
626   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
627   int32_t radius =
628       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
629   float alpha = 1.0f;                            // Chosen so that alpha_ in the implementation becomes 1
630   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
631   float bias = 0.0f;                             // Don't offset the reduction.
632
633   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
634   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
635
636   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
637                                                                radius, alpha, beta, bias, false);
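  // With these parameters (alpha = 1, beta = 0.5, bias = 0 and scaling disabled) the cross-map
  // normalization reduces to roughly x / sqrt(sum(x^2) over depth), i.e. an L2 normalization
  // along the depth axis, which is the repurposing described above (my reading of
  // NormalizationLayerInfo, not confirmed against the ACL sources here).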
638
639   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
640       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
641
642   _return_fn = asAclFunction(std::move(fn));
643 }
644
645 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
646 {
647   const auto ofm_index{node.getOutputs().at(0)};
648   const auto ifm_index{
649       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
650
651   auto radius = node.param().radius;
652   auto alpha = node.param().alpha;
653   auto beta = node.param().beta;
654   auto bias = node.param().bias;
655
656   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
657   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
658
659   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
660       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
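  // The frontend radius counts neighbours on one side only, while the ACL norm_size argument is
  // assumed to be the full window width, hence the radius * 2 + 1 conversion (e.g. radius 2
  // becomes a 5-element window).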
661
662   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
663       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
664
665   _return_fn = asAclFunction(std::move(fn));
666 }
667
668 void KernelGenerator::visit(const ir::operation::LSTM &node)
669 {
670   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
671                                          ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
672 }
673
674 void KernelGenerator::visit(const ir::operation::Pack &node)
675 {
676   const auto output_index{node.getOutputs().at(0)};
677   auto axis{node.param().axis};
678
679   const auto output_rank = _ctx.at(output_index).shape().rank();
680
681   std::vector<ir::OperandIndex> input_indexes;
682   for (const auto &input_index : node.getInputs())
683     input_indexes.emplace_back(input_index);
684
685   auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
686   std::vector<arm_compute::ITensor *> inputs;
687   for (const auto &input_index : input_indexes)
688     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
689
690   const auto frontend_layout = _current_op_seq_layout;
691   const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();
692
693   if (axis < 0)
694     axis += output_rank;
695   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
696
697   // Disable applied dim_correction
698   for (const auto &input_index : input_indexes)
699   {
700     size_t input_rank = _ctx.at(input_index).shape().rank();
701     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
702     assert(input_rank == input_tensor->num_dimensions());
703     if (input_rank != input_tensor->info()->num_dimensions())
704     {
705       // This means that the highest dimensions have value 1 and dim_correction was applied to the input tensor
706       input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
707           _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
708     }
709   }
710
711   auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
712
713   // acl_neon does not revert the disabled dim_correction here because acl_neon's kernels may
714   // use arm_compute::TensorInfo::offset_element_in_bytes(), which would cause an error when a
715   // kernel accesses a high dimension whose value is 1.
716
717   _return_fn = asAclFunction(std::move(fn));
718 }
719
720 void KernelGenerator::visit(const ir::operation::Pad &node)
721 {
722   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
723   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
724   const auto output_index{node.getOutputs().at(0)};
725   assert(_ctx.at(pad_index).data());
726
727   auto rank = _ctx.at(input_index).shape().rank();
728   auto pad_base = _ctx.at(pad_index).data()->base();
729
730   auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
731   auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
732
733   ::arm_compute::PaddingList padding_list;
734   padding_list.resize(rank);
735   for (int32_t n = 0; n < rank; ++n)
736   {
737     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
738
739     const auto frontend_layout = _current_op_seq_layout;
740     const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
741     const auto axis =
742         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
743     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
744   }
745
746   const auto input_type = _ctx.at(input_index).typeInfo();
747   UNUSED_RELEASE(input_type);
748   assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
749   assert(input->info()->quantization_info() ==
750          ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
751   const auto pixel_value =
752       ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
753
754   auto fn =
755       acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
756
757   _return_fn = asAclFunction(std::move(fn));
758 }
759
760 void KernelGenerator::visit(const ir::operation::Pool2D &node)
761 {
762   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
763       node, _ctx, _tensor_reg, _current_op_seq_layout,
764       acl_common::convertPoolType(node.param().op_type));
765
766   const auto ofm_index{node.getOutputs().at(0)};
767   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
768   const auto activation = node.param().activation;
769   _return_fn = std::make_unique<exec::FunctionSequence>(
770       asAclFunction(std::move(raw_fn)),
771       ActivationBuilder::generate(activation, ofm_tensor->handle()));
772 }
773
774 void KernelGenerator::visit(const ir::operation::Permute &node)
775 {
776   const auto ofm_idx{node.getOutputs().at(0)};
777   const auto ifm_idx{node.getInputs().at(0)};
778   const auto permute_type = node.getPermuteType();
779   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
780   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
781   const auto rank = _ctx.at(ofm_idx).shape().rank();
782   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
783
784   std::unique_ptr<::arm_compute::IFunction> fn;
785   arm_compute::PermutationVector pv;
786   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
787   {
788     // WHCN -> CWHN
789     pv = arm_compute::PermutationVector{2, 0, 1};
790
791     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
792                                                            ofm_tensor->handle(), pv);
793   }
794   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
795   {
796     // CWHN -> WHCN
797     pv = arm_compute::PermutationVector{1, 2, 0};
798
799     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
800                                                            ofm_tensor->handle(), pv);
801   }
802   else
803   {
804     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
805   }
806   _return_fn = asAclFunction(std::move(fn));
807 }
808
809 void KernelGenerator::visit(const ir::operation::PReLU &node)
810 {
811   const auto ofm_index{node.getOutputs().at(0)};
812   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
813   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
814
815   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
816   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
817   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();
818
819   auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
820       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
821
822   _return_fn = asAclFunction(std::move(fn));
823 }
824
825 void KernelGenerator::visit(const ir::operation::Reduce &node)
826 {
827   const auto output_index{node.getOutputs().at(0)};
828   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
829   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
830
831   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
832   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
833
834   // Convert to ACL axes taking into account negative values and possible duplicates.
835   const auto &axes = _ctx.at(axes_index);
836   const auto input_rank = _ctx.at(input_index).shape().rank();
837   const auto frontend_layout = _current_op_seq_layout;
838   const auto backend_layout = input_tensor->layout();
839   const auto reduce_axes =
840       acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
841   const auto reduce_type = node.param().reduce_type;
842   const auto keep_dims = node.param().keep_dims;
843
844   std::unique_ptr<::arm_compute::IFunction> fn;
845   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
846   {
847     fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
848                                                               keep_dims, output_tensor->handle());
849   }
850   else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
851   {
852     fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
853                                                              keep_dims, output_tensor->handle());
854   }
855   else
856   {
857     fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
858         input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
859         acl_common::convertReduceType(reduce_type));
860   }
861   _return_fn = asAclFunction(std::move(fn));
862 }
863
864 void KernelGenerator::visit(const ir::operation::Reshape &node)
865 {
866   const auto output_index{node.getOutputs().at(0)};
867   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
868
869   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
870   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
871
872   // NOTE This operation must not change the layout from frontend to backend.
873   //      So, PermutationOperationPass makes the frontend and backend layouts the same.
874   const auto frontend_layout = _current_op_seq_layout;
875   const auto backend_layout = output_tensor->layout();
876   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
877          frontend_layout == backend_layout);
878   UNUSED_RELEASE(frontend_layout);
879   UNUSED_RELEASE(backend_layout);
880
881   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
882                                                                    output_tensor->handle());
883
884   _return_fn = asAclFunction(std::move(fn));
885 }
886
887 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
888 {
889   const auto ofm_index{node.getOutputs().at(0)};
890
891   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
892
893   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
894   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
895
896   auto fn = acl_common::generateLayer<arm_compute::NEScale>(
897       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
898       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
899       ::arm_compute::SamplingPolicy::TOP_LEFT);
900
901   _return_fn = asAclFunction(std::move(fn));
902 }
903
904 void KernelGenerator::visit(const ir::operation::RNN &node)
905 {
906   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
907   const auto hidden_state_out_index{
908       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
909
910   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
911   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
912   const auto recurrent_weights_index{
913       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
914   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
915   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
916
917   const auto activation = node.param().activation;
918
919   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
920   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();
921
922   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
923   auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
924   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
925   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
926   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
927   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
928
929   auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
930       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
931   _return_fn = asAclFunction(std::move(copy_layer));
932
933   auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
934       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
935       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
936       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
937   _return_fn = asAclFunction(std::move(fn));
938 }
939
940 void KernelGenerator::visit(const ir::operation::Squeeze &node)
941 {
942   // Squeeze is identical to reshape except that it has an optional dimensions input.
943   // In addition, the optional dims input is ignored since the output tensor already has the
944   // squeezed shape produced by the freezer and toco.
945   const auto output_index{node.getOutputs().at(0)};
946   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
947   const auto dims{node.param().dims};
948   const auto ndim{node.param().ndim};
949   (void)dims;
950   (void)ndim;
951
952   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
953   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
954   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
955                                                                    output_tensor->handle());
956   _return_fn = asAclFunction(std::move(fn));
957 }
958
959 void KernelGenerator::visit(const ir::operation::Softmax &node)
960 {
961   const auto output_index{node.getOutputs().at(0)};
962   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
963   const auto beta = node.param().beta;
964
965   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
966   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
967   const auto frontend_layout = _current_op_seq_layout;
968   const auto backend_layout = input_tensor->layout();
969
970   // Disable applied dim_correction
971   const size_t input_rank = _ctx.at(input_index).shape().rank();
972   if (input_rank != input_tensor->info()->num_dimensions())
973   {
974     // This means that the highest dimensions have value 1 and dim_correction was applied to the input tensor
975     const auto input = _ctx.at(input_index);
976     input_tensor->info()->set_tensor_shape(
977         acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
978   }
979
980   auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
981       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
982       output_tensor->handle(), beta);
983
984   _return_fn = asAclFunction(std::move(fn));
985 }
986
987 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
988 {
989   const auto ofm_index{node.getOutputs().at(0)};
990   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
991   const auto block_size_index{
992       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
993   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
994
995   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
996   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
997   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
998   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();
999
1000   assert(_ctx.at(block_size_index).data());
1001   assert(_ctx.at(paddings_index).data());
1002
1003   auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
1004       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1005       ofm_tensor->handle());
1006
1007   _return_fn = asAclFunction(std::move(fn));
1008 }
1009
1010 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1011 {
1012   const auto ofm_index{node.getOutputs().at(0)};
1013   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1014
1015   auto block_size = node.param().block_size;
1016
1017   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1018   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1019
1020   auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
1021       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1022
1023   _return_fn = asAclFunction(std::move(fn));
1024 }
1025
1026 void KernelGenerator::visit(const ir::operation::Split &node)
1027 {
1028   // TODO Support this op by SubTensor
1029   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1030
1031   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1032
1033   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1034   std::vector<ir::OperandIndex> output_indexes;
1035   for (const auto &output : node.getOutputs())
1036     output_indexes.emplace_back(output);
1037
1038   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1039   std::vector<arm_compute::ITensor *> output_tensors;
1040   for (const auto &ofm_ind : output_indexes)
1041     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());
1042
1043   const auto frontend_layout = _current_op_seq_layout;
1044   const auto backend_layout = ifm_tensor->layout();
1045   auto axis = node.param().axis;
1046   if (axis < 0)
1047     axis += ifm_rank;
1048   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1049
1050   auto fn =
1051       acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
1052
1053   _return_fn = asAclFunction(std::move(fn));
1054 }
1055
1056 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1057 {
1058   const auto ofm_index{node.getOutputs().at(0)};
1059   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1060   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1061
1062   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1063   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
1064   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
1065
1066   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1067       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1068
1069   _return_fn = asAclFunction(std::move(fn));
1070 }
1071
1072 void KernelGenerator::visit(const ir::operation::Slice &node)
1073 {
1074   const auto output_index{node.getOutputs().at(0)};
1075   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1076   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1077   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1078
1079   auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
1080   auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
1081   const auto frontend_layout = _current_op_seq_layout;
1082   const auto backend_layout = inputData_tensor->layout();
1083
1084   // Set up the index data (starts and ends) following the axis order of inputData
1085   int input_rank = _ctx.at(input_index).shape().rank();
1086   std::vector<int32_t> starts;
1087   std::vector<int32_t> ends;
1088   starts.resize(input_rank, 0);
1089   ends.resize(input_rank, 0);
1090   {
1091     auto beginData_base = _ctx.at(begins_index).data()->base();
1092     auto sizeData_base = _ctx.at(sizes_index).data()->base();
1093     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1094     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1095
1096     using ir::DataType;
1097
1098     UNUSED_RELEASE(beginData_size);
1099     UNUSED_RELEASE(sizeData_size);
1100
1101     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1102     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1103     assert(beginData_size == input_rank);
1104     assert(sizeData_size == input_rank);
1105
1106     assert(beginData_base != nullptr);
1107     for (int n = 0; n < input_rank; ++n)
1108     {
1109       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1110                                                                  backend_layout)
1111                       .value();
1112
1113       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1114       starts[axis] = begin_value;
1115
1116       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1117       ends[axis] = begin_value + size_value;
1118     }
1119   }
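  // NESlice is assumed to take absolute end coordinates rather than sizes, so ends[] is computed
  // as begin + size above; e.g. begins = [1, 2] and sizes = [3, 4] become starts = [1, 2] and
  // ends = [4, 6] (before the axis reordering done by ToARMComputeAxis).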
1120
1121   ::arm_compute::Coordinates starts_set;
1122   ::arm_compute::Coordinates ends_set;
1123
1124   for (size_t i = 0; i < starts.size(); ++i)
1125   {
1126     starts_set.set(i, starts[i]);
1127     ends_set.set(i, ends[i]);
1128   }
1129
1130   auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1131       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1132
1133   _return_fn = asAclFunction(std::move(fn));
1134 }
1135
1136 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1137 {
1138   const auto output_index{node.getOutputs().at(0)};
1139   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1140   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1141   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1142   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1143
1144   auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
1145   auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
1146   const auto frontend_layout = _current_op_seq_layout;
1147   const auto backend_layout = inputData_tensor->layout();
1148
1149   // Set up the index data (starts, ends and strides) following the axis order of inputData
1150   int input_rank = _ctx.at(input_index).shape().rank();
1151   std::vector<int32_t> starts;
1152   std::vector<int32_t> ends;
1153   std::vector<int32_t> strides;
1154   starts.resize(input_rank, 0);
1155   ends.resize(input_rank, 0);
1156   strides.resize(input_rank, 0);
1157   {
1158     auto startData_base = _ctx.at(starts_index).data()->base();
1159     auto endData_base = _ctx.at(ends_index).data()->base();
1160     auto stridesData_base = _ctx.at(strides_index).data()->base();
1161     const int startData_size = _ctx.at(starts_index).shape().num_elements();
1162     const int endData_size = _ctx.at(ends_index).shape().num_elements();
1163     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1164
1165     using ir::DataType;
1166
1167     UNUSED_RELEASE(startData_size);
1168     UNUSED_RELEASE(endData_size);
1169     UNUSED_RELEASE(stridesData_size);
1170
1171     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1172     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1173     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1174     assert(startData_size == input_rank);
1175     assert(endData_size == input_rank);
1176     assert(stridesData_size == input_rank);
1177
1178     assert(startData_base != nullptr);
1179     for (int n = 0; n < input_rank; ++n)
1180     {
1181       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1182                                                                  backend_layout)
1183                       .value();
1184
1185       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1186       starts[axis] = start_value;
1187
1188       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1189       ends[axis] = end_value;
1190
1191       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1192       strides[axis] = strides_value;
1193     }
1194   }
1195
1196   // Reorder the mask bits to follow the axis order of inputData
1197   // FIXME Take the layouts into account.
1198   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1199   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1200   const auto shrink_axis_mask =
1201       acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
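  // ReorderBits is assumed to permute the mask bits the same way the axes were reversed above, so
  // that bit n of the frontend mask lands on the ACL axis that frontend axis n was mapped to
  // (layouts are not taken into account yet, as the FIXME above notes).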
1202
1203   ::arm_compute::Coordinates starts_set;
1204   ::arm_compute::Coordinates ends_set;
1205   ::arm_compute::BiStrides strides_set;
1206
1207   for (size_t i = 0; i < starts.size(); ++i)
1208   {
1209     starts_set.set(i, starts[i]);
1210     ends_set.set(i, ends[i]);
1211     strides_set.set(i, strides[i]);
1212   }
1213
1214   auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1215       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1216       begin_mask, end_mask, shrink_axis_mask);
1217
1218   _return_fn = asAclFunction(std::move(fn));
1219 }
1220
1221 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1222 {
1223   const auto ofm_index{node.getOutputs().at(0)};
1224   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1225   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1226
1227   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
1228   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
1229   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);
1230
1231   const auto stride = node.param().stride;
1232
1233   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1234          (node.param().padding.type == ir::PaddingType::VALID));
1235   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1236                                       ker_shape.W, ker_shape.H);
1237
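       // With VALID padding the requested output may be wider/taller than what the
       // deconvolution actually covers ((ifm - 1) * stride + kernel); the difference is
       // handed to NETransposeConvLayer below as invalid_horizontal / invalid_vertical.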
1238   uint32_t invalid_horizontal = 0;
1239   uint32_t invalid_vertical = 0;
1240   if (node.param().padding.type == ir::PaddingType::VALID)
1241   {
1242     invalid_horizontal =
1243         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1244     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1245   }
1246
1247   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1248   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1249   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
1250
1251   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1252
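       // No bias is wired up for this operation, so nullptr is passed as the bias tensor.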
1253   auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1254       ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1255       invalid_horizontal, invalid_vertical);
1256
1257   _return_fn = asAclFunction(std::move(fn));
1258 }
1259
1260 void KernelGenerator::visit(const ir::operation::Transpose &node)
1261 {
1262   const auto ofm_idx{node.getOutputs().at(0)};
1263   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1264   const auto &perm{node.param().perm};
1265
1266   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
1267   const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
1268   const auto frontend_layout = _current_op_seq_layout;
1269   const auto backend_layout = ifm_tensor->layout();
1270
1271   const auto rank = _ctx.at(ifm_idx).shape().rank();
1272   std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
1273   auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
1274       rank, pv, frontend_layout, backend_layout);
1275
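       // Rank <= 2 tensors can use the simple 2-D NETranspose; higher ranks need NEPermute
       // driven by the layout-adjusted permutation vector computed above.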
1276   std::unique_ptr<::arm_compute::IFunction> fn;
1277   if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
1278   {
1279     fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1280                                                              ofm_tensor->handle());
1281   }
1282   else
1283   {
1284     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1285                                                            ofm_tensor->handle(), backend_pv);
1286   }
1287   _return_fn = asAclFunction(std::move(fn));
1288 }
1289
1290 void KernelGenerator::visit(const ir::operation::Unpack &node)
1291 {
1292   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1293   auto axis{node.param().axis};
1294
1295   const auto input_rank = _ctx.at(input_index).shape().rank();
1296
1297   std::vector<ir::OperandIndex> output_indexes;
1298   for (const auto &output_index : node.getOutputs())
1299     output_indexes.emplace_back(output_index);
1300
1301   auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
1302   std::vector<arm_compute::ITensor *> outputs;
1303   for (const auto &output_index : output_indexes)
1304     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1305
1306   const auto frontend_layout = _current_op_seq_layout;
1307   const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
1308   if (axis < 0)
1309     axis += input_rank;
1310   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1311
1312   // Disable the dim_correction applied to the output tensors so each keeps its original rank
1313   std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
1314   for (const auto &output_index : output_indexes)
1315   {
1316     size_t output_rank = _ctx.at(output_index).shape().rank();
1317     const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
1318     orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
1319     assert(output_rank == output_tensor->num_dimensions());
1320     if (output_rank != output_tensor->info()->num_dimensions())
1321     {
1322       // The higher dims are 1 and dim_correction was applied to this output; restore its full-rank shape
1323       output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
1324           _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
1325     }
1326   }
1327
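       // NEUnstack slices the input along the converted axis, one output tensor per slice.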
1328   auto fn = acl_common::generateLayer<arm_compute::NEUnstack>(input, outputs, axis);
1329
1330   _return_fn = asAclFunction(std::move(fn));
1331 }
1332
1333 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1334 {
1335   const auto output_index{node.getOutputs().at(0)};
1336   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1337
1338   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
1339   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
1340
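       // ExpandDims only inserts a dimension of size 1, so it lowers to a plain reshape.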
1341   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1342                                                                    output_tensor->handle());
1343
1344   _return_fn = asAclFunction(std::move(fn));
1345 }
1346
1347 void KernelGenerator::visit(const ir::operation::Comparison &node)
1348 {
1349   const auto output_index{node.getOutputs().at(0)};
1350   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1351   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1352
1353   const auto comparison_type = node.param().comparison_type;
1354
1355   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
1356   auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
1357   auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();
1358
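       // The IR comparison_type is cast directly to arm_compute::ComparisonOperation,
       // which relies on both enums listing the operations in the same order.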
1359   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1360       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1361       (arm_compute::ComparisonOperation)comparison_type);
1362
1363   _return_fn = asAclFunction(std::move(fn));
1364 }
1365
1366 void KernelGenerator::visit(const ir::operation::OneHot &node)
1367 {
1368   const auto out_idx{node.getOutputs().at(0)};
1369   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1370   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1371   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1372   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1373   const auto axis = node.param().axis;
1374
1375   auto output_tensor = _tensor_reg->getAclTensor(out_idx).get();
1376   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx).get();
1377   auto depth_tensor = _tensor_reg->getAclTensor(depth_idx).get();
1378   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx).get();
1379   auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx).get();
1380
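       // OneHot runs on a plain C++ kernel (CPPOneHotEx, presumably from the ARMComputeEx
       // extensions) rather than a NEON one.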
1381   auto fn = acl_common::generateLayer<arm_compute::CPPOneHotEx>(
1382       indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1383       offvalue_tensor->handle(), output_tensor->handle(), axis);
1384   _return_fn = asAclFunction(std::move(fn));
1385 }
1386
1387 } // namespace acl_neon
1388 } // namespace backend
1389 } // namespace onert