runtime/onert/backend/acl_neon/KernelGenerator.cc (platform/core/ml/nnfw.git @ ffaee3b3)
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
20 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "util/logging.h"
32 #include "util/Utils.h"
33 #include "AclKernelGen.h"
34
35 namespace onert
36 {
37 namespace backend
38 {
39 namespace acl_neon
40 {
41
42 using ::onert::backend::acl_common::asAclFunction;
43 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
44     ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
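// ActivationBuilder appends a trailing NEActivationLayer when an operation carries a fused
// activation; for ir::Activation::NONE it is expected to return a no-op function, so it can be
// chained unconditionally after the main kernel.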
45
46 KernelGenerator::KernelGenerator(
47     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
48     const std::shared_ptr<TensorBuilder> &tensor_builder,
49     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
50     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
51       _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
52 {
53   // DO NOTHING
54 }
55
56 void KernelGenerator::visit(const ir::OpSequence &op_seq)
57 {
58   // TODO Move this to IKernelGenerator
59   //      (all derivatives have the same implementation for this)
60   assert(!_return_fn_seq);
61   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
62   _return_fn_seq->enableDynamicShapeInferer(false);
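  // acl_neon kernels are configured for static shapes, so dynamic shape inference is disabled
  // for the whole op sequence.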
63
64   _current_op_seq_layout = op_seq.getLayout();
65   for (const auto &operation_idx : op_seq.operations())
66   {
67     const auto &node = _operations_ctx.at(operation_idx);
68     node.accept(*this);
69     _return_fn_seq->append(releaseFunction());
70   }
71 }
72
73 void KernelGenerator::visit(const ir::operation::ArgMax &node)
74 {
75   const auto ofm_index{node.getOutputs().at(0)};
76   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
77   const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
78
79   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
80
81   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
82   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
83   auto frontend_layout = _current_op_seq_layout;
84   auto backend_layout = ifm_tensor->layout();
85
86   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
87   if (axis_value < 0)
88   {
89     axis_value += ifm_rank;
90   }
91   assert(axis_value >= 0 && axis_value < ifm_rank);
92   const auto fixed_axis =
93       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
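  // ToARMComputeAxis maps a frontend axis to ACL's reversed dimension order (and applies any
  // NHWC<->NCHW permutation); e.g. with matching layouts and rank 4, frontend axis 0 maps to
  // ACL axis 3.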
94
95   auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
96       ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
97       arm_compute::ReductionOperation::ARG_IDX_MAX);
98
99   _return_fn = asAclFunction(std::move(fn));
100 }
101
102 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
103 {
104   const auto ofm_index{node.getOutputs().at(0)};
105   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
106   const auto block_size_index{
107       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
108
109   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
110   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
111   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
112
113   assert(_ctx.at(block_size_index).data());
114
115   auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
116       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
117
118   _return_fn = asAclFunction(std::move(fn));
119 }
120
121 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
122 {
123   const auto ofm_index{node.getOutputs().at(0)};
124   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
125   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
126
127   const auto activation = node.param().activation;
128
129   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
130   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
131   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
132
133   std::unique_ptr<arm_compute::IFunction> fn;
134   switch (node.param().arithmetic_type)
135   {
136     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
137     {
138       fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
139           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
140           arm_compute::ConvertPolicy::SATURATE);
141       break;
142     }
143     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
144     {
145       fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
146           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
147           arm_compute::ConvertPolicy::SATURATE);
148       break;
149     }
150     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
151     {
152       // Only RoundingPolicy::TO_ZERO is allowed when scale is 1.0
153       fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
154           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
155           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
156       break;
157     }
158     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
159     {
160       fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
161           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
162       break;
163     }
164     default:
165       assert(false && "BinaryArithmetic: unsupported arithmetic type");
166       break;
167   }
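  // Chain the arithmetic kernel and the optional fused activation into one FunctionSequence so
  // they execute back to back.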
168   _return_fn = std::make_unique<exec::FunctionSequence>(
169       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
170 }
171
172 void KernelGenerator::visit(const ir::operation::Conv2D &node)
173 {
174   using ir::operation::Conv2D;
175
176   const auto ofm_index{node.getOutputs().at(0)};
177   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
178   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
179   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
180
181   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
182   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
183   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
184   const auto &ker_shape = _ctx.at(ker_index).shape();
185   const auto ker_height = ker_shape.dim(1);
186   const auto ker_width = ker_shape.dim(2);
187
188   const auto stride = node.param().stride;
189   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
190                                             ker_width, ker_height);
191   const auto activation = node.param().activation;
192
193   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
194   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
195   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
196   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
197
198   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
199   const auto act_info = acl_common::asActivationLayerInfo(activation);
200
201   auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
202       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
203       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
204       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
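  // WeightsInfo() keeps ACL's defaults (no pre-reshaped weights) and Size2D(1U, 1U) requests unit
  // dilation; the fused activation is handed to the convolution itself via act_info.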
205
206   _return_fn = asAclFunction(std::move(fn));
207 }
208
209 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
210 {
211   const auto output_index{node.getOutputs().at(0)};
212   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
213
214   auto block_size = node.param().block_size;
215   assert(block_size > 0);
216
217   auto output_tensor = _tensor_reg->getAclTensor(output_index);
218   auto input_tensor = _tensor_reg->getAclTensor(input_index);
219
220   auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
221       input_tensor->handle(), output_tensor->handle(), block_size);
222
223   _return_fn = asAclFunction(std::move(fn));
224 }
225
226 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
227 {
228   using ir::operation::DepthwiseConv2D;
229
230   const auto ofm_index{node.getOutputs().at(0)};
231   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
232   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
233   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
234
235   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
236   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
237   // Kernel format is [1, kernel_height, kernel_width, depth_out].
238   const auto &ker_shape = _ctx.at(ker_index).shape();
239   const auto ker_height = ker_shape.dim(1);
240   const auto ker_width = ker_shape.dim(2);
241
242   const auto stride = node.param().stride;
243   const auto dilation = node.param().dilation;
244   const auto padding =
245       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
246                            ker_height, dilation.width_factor, dilation.height_factor);
247   const auto multiplier = node.param().multiplier;
248   const auto activation = node.param().activation;
249
250   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
251   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
252   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
253   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
254
255   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
256   const auto act_info = acl_common::asActivationLayerInfo(activation);
257   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
258
259   auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
260       ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
261       conv_info, multiplier, act_info, dilation_info);
262
263   _return_fn = asAclFunction(std::move(fn));
264 }
265
266 void KernelGenerator::visit(const ir::operation::Concat &node)
267 {
268   const auto ofm_index{node.getOutputs().at(0)};
269
270   std::vector<ir::OperandIndex> input_indexes;
271   for (const auto &input : node.getInputs())
272     input_indexes.emplace_back(input);
273
274   const auto axis = node.param().axis;
275
276   // Concat elimination check
277   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
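  // When every input is already registered as a sub-tensor of the output, the concatenation is
  // realized purely by the shared memory layout, so no kernel has to run.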
278   if (eliminated)
279   {
280     // If concat eliminated, return a NOP IFunction
281     VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
282     _return_fn = std::make_unique<exec::NopFunction>();
283     return;
284   }
285
286   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
287   std::vector<::arm_compute::ITensor *> input_tensors;
288   for (const auto &ifm_ind : input_indexes)
289     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
290
291   std::unique_ptr<::arm_compute::IFunction> fn;
292   if (input_indexes.size() < 2)
293   {
294     fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
295                                                         output_tensor->handle());
296   }
297   else
298   {
299     const auto rank = _ctx.at(ofm_index).shape().rank();
300     const auto frontend_layout = _current_op_seq_layout;
301     const auto backend_layout = output_tensor->layout();
302     const auto fixed_axis =
303         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
304     fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
305         input_tensors, output_tensor->handle(), fixed_axis);
306   }
307
308   _return_fn = asAclFunction(std::move(fn));
309 }
310
311 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
312 {
313   const auto ofm_index{node.getOutputs().at(0)};
314   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
315
316   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
317   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
318
319   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
320       node.param().op_type, node.param().alpha, node.param().beta);
321
322   std::unique_ptr<arm_compute::IFunction> fn =
323       acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
324                                                                 ofm_tensor->handle(), act_info);
325
326   _return_fn = asAclFunction(std::move(fn));
327 }
328
329 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
330 {
331   const auto output_index{node.getOutputs().at(0)};
332   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
333   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
334
335   auto output_tensor = _tensor_reg->getAclTensor(output_index);
336   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
337   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
338
339   std::unique_ptr<arm_compute::IFunction> fn;
340   switch (node.param().op_type)
341   {
342     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
343     {
344       fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
345           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
346       break;
347     }
348     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
349     {
350       fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
351           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
352       break;
353     }
354     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
355     {
356       fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
357           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
358       break;
359     }
360     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
361     {
362       fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
363           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
364       break;
365     }
366     default:
367     {
368       std::string err_msg("acl_neon KernelGenerator : " + node.name() +
369                           " is not an elementwise-binary operation");
370       assert(false && err_msg.c_str());
371       break;
372     }
373   }
374   _return_fn = asAclFunction(std::move(fn));
375 }
376
377 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
378 {
379   const auto output_index{node.getOutputs().at(0)};
380   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
381
382   auto output_tensor = _tensor_reg->getAclTensor(output_index);
383   auto input_tensor = _tensor_reg->getAclTensor(input_index);
384
385   std::unique_ptr<arm_compute::IFunction> fn;
386   switch (node.param().op_type)
387   {
388     case ir::operation::ElementwiseUnary::Type::ABS:
389     {
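      // ACL models ABS (and SQRT below) as activation functions, so NEActivationLayer is reused
      // here instead of a dedicated elementwise kernel.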
390       const ::arm_compute::ActivationLayerInfo act_info{
391           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
392
393       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
394           input_tensor->handle(), output_tensor->handle(), act_info);
395       break;
396     }
397     case ir::operation::ElementwiseUnary::Type::CAST:
398     {
399       if (input_tensor->data_type() == output_tensor->data_type())
400       {
401         fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
402                                                             output_tensor->handle());
403       }
404       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
405       {
406         fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
407                                                                 output_tensor->handle());
408       }
409       else
410       {
411         fn = acl_common::generateLayer<arm_compute::NECast>(
412             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
413       }
414       break;
415     }
416     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
417     {
418       fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
419                                                                          output_tensor->handle());
420       break;
421     }
422     case ir::operation::ElementwiseUnary::Type::EXP:
423     {
424       fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
425                                                               output_tensor->handle());
426       break;
427     }
428     case ir::operation::ElementwiseUnary::Type::FLOOR:
429     {
430       fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
431                                                            output_tensor->handle());
432       break;
433     }
434     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
435     {
436       fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
437                                                                 output_tensor->handle());
438       break;
439     }
440     case ir::operation::ElementwiseUnary::Type::NEG:
441     {
442       fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
443                                                               output_tensor->handle());
444       break;
445     }
446     case ir::operation::ElementwiseUnary::Type::RSQRT:
447     {
448       fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
449                                                                 output_tensor->handle());
450       break;
451     }
452     case ir::operation::ElementwiseUnary::Type::SQRT:
453     {
454       const ::arm_compute::ActivationLayerInfo act_info{
455           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
456
457       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
458           input_tensor->handle(), output_tensor->handle(), act_info);
459       break;
460     }
461     default:
462     {
463       throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
464                                " is not supported yet");
465       break;
466     }
467   }
468   _return_fn = asAclFunction(std::move(fn));
469 }
470
471 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
472 {
473   const auto output_index{node.getOutputs().at(0)};
474   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
475   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
476
477   auto output_tensor = _tensor_reg->getAclTensor(output_index);
478   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
479   auto values_tensor = _tensor_reg->getAclTensor(values_index);
480
481   auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
482       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
483
484   _return_fn = asAclFunction(std::move(fn));
485 }
486
487 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
488 {
489   const auto output_index{node.getOutputs().at(0)};
490   auto output_tensor = _tensor_reg->getAclTensor(output_index);
491   const auto activation = node.param().activation;
492   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
493     throw std::runtime_error(
494         "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
495
496   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
497                                                 ::arm_compute::NEFullyConnectedReshapingLayer>(
498       node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
499   _return_fn = std::make_unique<exec::FunctionSequence>(
500       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
501 }
502
503 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
504 {
505   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
506   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
507
508   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
509   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
510   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
511
512   auto output_tensor = _tensor_reg->getAclTensor(output_index);
513   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
514
515   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
516   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
517   auto values_tensor = _tensor_reg->getAclTensor(values_index);
518
519   auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
520       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
521       output_tensor->handle(), hits_tensor->handle());
522
523   _return_fn = asAclFunction(std::move(fn));
524 }
525
526 void KernelGenerator::visit(const ir::operation::Gather &node)
527 {
528   const auto ofm_index{node.getOutputs().at(0)};
529
530   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
531   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
532
533   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
534   const auto axis_raw = node.param().axis;
535   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
536   // Converting in reverse order
537   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
538
539   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
540   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
541   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
542   const auto backend_layout = ofm_tensor->layout();
543   UNUSED_RELEASE(backend_layout);
544
545   // NOTE The frontend layout and backend layout must be the same for this operation.
546   //      If they were not the same, we would have to add a stage to permute the output tensor,
547   //      which is not efficient even if it works. In that case it would be better to give these
548   //      backend tensors the same layout.
549   //      There is one more thing to consider: this operation depends on the layout of the model.
550   //      For example, if a model in NHWC uses this operation with output rank == 4, indices
551   //      rank == 2 and axis == 2, the operation works on the W and C axes, but W and C are not
552   //      adjacent in NCHW, so a backend using NCHW cannot handle this case.
553   assert(backend_layout == ifm_tensor->layout());
554   assert(backend_layout == indices_tensor->layout());
555   assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
556
557   // input is n-D, indices k-D, output is (n + k - 1)-D
558   size_t n = ifm_rank;
559   assert(n == ifm_tensor->num_dimensions());
560   size_t k = _ctx.at(indices_index).shape().rank();
561   assert(k == indices_tensor->num_dimensions());
562
563   // Disable applied dim_correction
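  // (ACL tensor info may drop trailing dimensions of size 1 -- "dim_correction" -- which makes
  //  num_dimensions disagree with the IR rank, so the correction is switched off around
  //  configure() and restored below.)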
564   if (n != ifm_tensor->info()->num_dimensions())
565   {
566     // This means the highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
567     acl_common::disableDimCorrection(ifm_tensor);
568   }
569   if (k != indices_tensor->info()->num_dimensions())
570   {
571     // This means the highest dimension's value is 1 and dim_correction has been applied to the indices tensor
572     acl_common::disableDimCorrection(indices_tensor);
573   }
574
575   auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
576       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
577
578   // Revert disabling applied dim_correction
579   if (ifm_tensor->dimension(0) == 1)
580   {
581     acl_common::enableDimCorrection(ifm_tensor);
582   }
583   if (indices_tensor->dimension(0) == 1)
584   {
585     acl_common::enableDimCorrection(indices_tensor);
586   }
587
588   _return_fn = asAclFunction(std::move(fn));
589 }
590
591 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
592 {
593   const auto ofm_index{node.getOutputs().at(0)};
594   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
595   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
596   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
597
598   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
599   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
600   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
601   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
602   auto epsilon = node.param().epsilon;
603   auto activation = node.param().activation;
604
605   auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
606       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
607       epsilon);
608
609   _return_fn = std::make_unique<exec::FunctionSequence>(
610       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
611 }
612
613 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
614 {
615   const auto ofm_index{node.getOutputs().at(0)};
616   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
617
618   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
619   // L2 Normalization always performs the reduction along the depth axis
620   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
621   // choosing normalization parameters as below
622
623   const auto &ifm_shape = _ctx.at(ifm_index).shape();
624   // TODO Support an optional constant operand specifying the dimension to normalize over
625   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
626   int32_t radius =
627       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
628   float alpha = 1.0f;                            // Chosen so that alpha_ becomes 1 in the ACL implementation
629   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
630   float bias = 0.0f;                             // Don't offset the reduction.
631
632   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
633   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
634
635   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
636                                                                radius, alpha, beta, bias, false);
637
638   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
639       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
640
641   _return_fn = asAclFunction(std::move(fn));
642 }
643
644 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
645 {
646   const auto ofm_index{node.getOutputs().at(0)};
647   const auto ifm_index{
648       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
649
650   auto radius = node.param().radius;
651   auto alpha = node.param().alpha;
652   auto beta = node.param().beta;
653   auto bias = node.param().bias;
654
655   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
656   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
657
658   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
659       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
660
661   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
662       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
663
664   _return_fn = asAclFunction(std::move(fn));
665 }
666
667 void KernelGenerator::visit(const ir::operation::LSTM &node)
668 {
669   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
670                                          ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
671 }
672
673 void KernelGenerator::visit(const ir::operation::Pack &node)
674 {
675   const auto output_index{node.getOutputs().at(0)};
676   auto axis{node.param().axis};
677
678   const auto output_rank = _ctx.at(output_index).shape().rank();
679
680   std::vector<ir::OperandIndex> input_indexes;
681   for (const auto &input_index : node.getInputs())
682     input_indexes.emplace_back(input_index);
683
684   auto output = _tensor_reg->getAclTensor(output_index)->handle();
685   std::vector<arm_compute::ITensor *> inputs;
686   for (const auto &input_index : input_indexes)
687     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
688
689   const auto frontend_layout = _current_op_seq_layout;
690   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
691
692   if (axis < 0)
693     axis += output_rank;
694   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
695
696   // Disable applied dim_correction
697   for (const auto &input_index : input_indexes)
698   {
699     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
700     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
701     {
702       // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
703       acl_common::disableDimCorrection(input_tensor);
704     }
705   }
706
707   auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
708
709   // Revert disabling applied dim_correction
710   for (const auto &input_index : input_indexes)
711   {
712     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
713     if (input_tensor->dimension(0) == 1)
714     {
715       acl_common::enableDimCorrection(input_tensor);
716     }
717   }
718
719   _return_fn = asAclFunction(std::move(fn));
720 }
721
722 void KernelGenerator::visit(const ir::operation::Pad &node)
723 {
724   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
725   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
726   const auto output_index{node.getOutputs().at(0)};
727   assert(_ctx.at(pad_index).data());
728
729   auto rank = _ctx.at(input_index).shape().rank();
730   auto pad_base = _ctx.at(pad_index).data()->base();
731
732   auto input = _tensor_reg->getAclTensor(input_index)->handle();
733   auto output = _tensor_reg->getAclTensor(output_index)->handle();
734
735   ::arm_compute::PaddingList padding_list;
736   padding_list.resize(rank);
737   for (int32_t n = 0; n < rank; ++n)
738   {
739     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
740
741     const auto frontend_layout = _current_op_seq_layout;
742     const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
743     const auto axis =
744         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
745     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
746   }
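  // Each row of the constant pad operand holds {before, after} padding for one frontend
  // dimension; ToARMComputeAxis above remaps it to the corresponding backend axis.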
747
748   const auto input_type = _ctx.at(input_index).typeInfo();
749   UNUSED_RELEASE(input_type);
750   assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
751   assert(input->info()->quantization_info() ==
752          ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
753   const auto pixel_value =
754       ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
755
756   auto fn =
757       acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
758
759   _return_fn = asAclFunction(std::move(fn));
760 }
761
762 void KernelGenerator::visit(const ir::operation::Pool2D &node)
763 {
764   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
765       node, _ctx, _tensor_reg, _current_op_seq_layout,
766       acl_common::convertPoolType(node.param().op_type));
767
768   const auto ofm_index{node.getOutputs().at(0)};
769   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
770   const auto activation = node.param().activation;
771   _return_fn = std::make_unique<exec::FunctionSequence>(
772       asAclFunction(std::move(raw_fn)),
773       ActivationBuilder::generate(activation, ofm_tensor->handle()));
774 }
775
776 void KernelGenerator::visit(const ir::operation::Permute &node)
777 {
778   const auto ofm_idx{node.getOutputs().at(0)};
779   const auto ifm_idx{node.getInputs().at(0)};
780   const auto permute_type = node.getPermuteType();
781   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
782   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
783   const auto rank = _ctx.at(ofm_idx).shape().rank();
784   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
785
786   std::unique_ptr<::arm_compute::IFunction> fn;
787   arm_compute::PermutationVector pv;
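  // The permutation vectors below are written in ACL's reversed dimension order, which is why
  // the comments read WHCN/CWHN rather than NCHW/NHWC.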
788   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
789   {
790     // WHCN -> CWHN
791     pv = arm_compute::PermutationVector{2, 0, 1};
792
793     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
794                                                            ofm_tensor->handle(), pv);
795   }
796   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
797   {
798     // CWHN -> WHCN
799     pv = arm_compute::PermutationVector{1, 2, 0};
800
801     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
802                                                            ofm_tensor->handle(), pv);
803   }
804   else
805   {
806     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
807   }
808   _return_fn = asAclFunction(std::move(fn));
809 }
810
811 void KernelGenerator::visit(const ir::operation::PReLU &node)
812 {
813   const auto ofm_index{node.getOutputs().at(0)};
814   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
815   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
816
817   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
818   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
819   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
820
821   auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
822       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
823
824   _return_fn = asAclFunction(std::move(fn));
825 }
826
827 void KernelGenerator::visit(const ir::operation::Reduce &node)
828 {
829   const auto output_index{node.getOutputs().at(0)};
830   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
831   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
832
833   auto output_tensor = _tensor_reg->getAclTensor(output_index);
834   auto input_tensor = _tensor_reg->getAclTensor(input_index);
835
836   // Convert to ACL axes taking into account negative values and possible duplicates.
837   const auto &axes = _ctx.at(axes_index);
838   const auto input_rank = _ctx.at(input_index).shape().rank();
839   const auto frontend_layout = _current_op_seq_layout;
840   const auto backend_layout = input_tensor->layout();
841   const auto reduce_axes =
842       acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
843   const auto reduce_type = node.param().reduce_type;
844   const auto keep_dims = node.param().keep_dims;
845
846   std::unique_ptr<::arm_compute::IFunction> fn;
847   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
848   {
849     fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
850                                                               keep_dims, output_tensor->handle());
851   }
852   else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
853   {
854     fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
855                                                              keep_dims, output_tensor->handle());
856   }
857   else
858   {
859     fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
860         input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
861         acl_common::convertReduceType(reduce_type));
862   }
863   _return_fn = asAclFunction(std::move(fn));
864 }
865
866 void KernelGenerator::visit(const ir::operation::Reshape &node)
867 {
868   const auto output_index{node.getOutputs().at(0)};
869   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
870
871   auto output_tensor = _tensor_reg->getAclTensor(output_index);
872   auto input_tensor = _tensor_reg->getAclTensor(input_index);
873
874   // NOTE The layout of this operation must not be changed between frontend and backend,
875   //      so PermutationOperationPass makes the frontend and backend layouts the same.
876   const auto frontend_layout = _current_op_seq_layout;
877   const auto backend_layout = output_tensor->layout();
878   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
879          frontend_layout == backend_layout);
880   UNUSED_RELEASE(frontend_layout);
881   UNUSED_RELEASE(backend_layout);
882
883   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
884                                                                    output_tensor->handle());
885
886   _return_fn = asAclFunction(std::move(fn));
887 }
888
889 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
890 {
891   const auto ofm_index{node.getOutputs().at(0)};
892   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
893
894   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
895   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
896
897   auto fn = acl_common::generateLayer<arm_compute::NEScale>(
898       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
899       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
900       ::arm_compute::SamplingPolicy::TOP_LEFT);
901
902   _return_fn = asAclFunction(std::move(fn));
903 }
904
905 void KernelGenerator::visit(const ir::operation::RNN &node)
906 {
907   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
908   const auto hidden_state_out_index{
909       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
910
911   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
912   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
913   const auto recurrent_weights_index{
914       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
915   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
916   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
917
918   const auto activation = node.param().activation;
919
920   auto output_tensor = _tensor_reg->getAclTensor(output_index);
921   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
922
923   auto input_tensor = _tensor_reg->getAclTensor(input_index);
924   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
925   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
926   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
927   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
928   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
929
930   auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
931       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
932   _return_fn = asAclFunction(std::move(copy_layer));
933
934   auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
935       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
936       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
937       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
938   _return_fn = asAclFunction(std::move(fn));
939 }
940
941 void KernelGenerator::visit(const ir::operation::Squeeze &node)
942 {
943   // Squeeze is identical to Reshape except that it has an optional dimensions input.
944   // The optional dims input is ignored here because the output tensor already has the
945   // squeezed shape (produced by the frontend, e.g. freezer and toco).
946   const auto output_index{node.getOutputs().at(0)};
947   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
948   const auto dims{node.param().dims};
949   const auto ndim{node.param().ndim};
950   (void)dims;
951   (void)ndim;
952
953   auto output_tensor = _tensor_reg->getAclTensor(output_index);
954   auto input_tensor = _tensor_reg->getAclTensor(input_index);
955   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
956                                                                    output_tensor->handle());
957   _return_fn = asAclFunction(std::move(fn));
958 }
959
960 void KernelGenerator::visit(const ir::operation::Softmax &node)
961 {
962   const auto output_index{node.getOutputs().at(0)};
963   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
964   const auto beta = node.param().beta;
965
966   auto output_tensor = _tensor_reg->getAclTensor(output_index);
967   auto input_tensor = _tensor_reg->getAclTensor(input_index);
968
969   // Disable applied dim_correction
970   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
971   {
972     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
973     acl_common::disableDimCorrection(input_tensor);
974   }
975
976   auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
977       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
978       output_tensor->handle(), beta);
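  // Passing the internal buffer manager lets NESoftmaxLayer allocate its temporary workspace
  // through the backend's memory manager rather than on its own.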
979
980   // Revert disabling applied dim_correction
981   if (input_tensor->dimension(0) == 1)
982   {
983     acl_common::enableDimCorrection(input_tensor);
984   }
985
986   _return_fn = asAclFunction(std::move(fn));
987 }
988
989 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
990 {
991   const auto ofm_index{node.getOutputs().at(0)};
992   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
993   const auto block_size_index{
994       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
995   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
996
997   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
998   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
999   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1000   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1001
1002   assert(_ctx.at(block_size_index).data());
1003   assert(_ctx.at(paddings_index).data());
1004
1005   auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
1006       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1007       ofm_tensor->handle());
1008
1009   _return_fn = asAclFunction(std::move(fn));
1010 }
1011
1012 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1013 {
1014   const auto ofm_index{node.getOutputs().at(0)};
1015   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1016
1017   auto block_size = node.param().block_size;
1018
1019   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1020   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1021
1022   auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
1023       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1024
1025   _return_fn = asAclFunction(std::move(fn));
1026 }
1027
1028 void KernelGenerator::visit(const ir::operation::Split &node)
1029 {
1030   // TODO Support this op by SubTensor
1031   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1032   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1033
1034   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1035   if (!_ctx.at(axis_index).isConstant())
1036   {
1037     throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
1038   }
1039
1040   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1041   std::vector<ir::OperandIndex> output_indexes;
1042   for (const auto &output : node.getOutputs())
1043     output_indexes.emplace_back(output);
1044
1045   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1046   std::vector<arm_compute::ITensor *> output_tensors;
1047   for (const auto &ofm_ind : output_indexes)
1048     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1049
1050   const auto frontend_layout = _current_op_seq_layout;
1051   const auto backend_layout = ifm_tensor->layout();
1052   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1053   if (axis < 0)
1054     axis += ifm_rank;
1055   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1056
1057   auto fn =
1058       acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
1059
1060   _return_fn = asAclFunction(std::move(fn));
1061 }
1062
1063 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1064 {
1065   const auto ofm_index{node.getOutputs().at(0)};
1066   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1067   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1068
1069   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1070   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1071   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1072
1073   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1074       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1075
1076   _return_fn = asAclFunction(std::move(fn));
1077 }
1078
1079 void KernelGenerator::visit(const ir::operation::Slice &node)
1080 {
1081   const auto output_index{node.getOutputs().at(0)};
1082   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1083   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1084   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1085
1086   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1087   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1088   const auto frontend_layout = _current_op_seq_layout;
1089   const auto backend_layout = inputData_tensor->layout();
1090
1091   // Build the starts/ends index data, reordered to match the axis order of inputData
1092   int input_rank = _ctx.at(input_index).shape().rank();
1093   std::vector<int32_t> starts;
1094   std::vector<int32_t> ends;
1095   starts.resize(input_rank, 0);
1096   ends.resize(input_rank, 0);
1097   {
1098     auto beginData_base = _ctx.at(begins_index).data()->base();
1099     auto sizeData_base = _ctx.at(sizes_index).data()->base();
1100     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1101     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1102
1103     using ir::DataType;
1104
1105     UNUSED_RELEASE(beginData_size);
1106     UNUSED_RELEASE(sizeData_size);
1107
1108     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1109     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1110     assert(beginData_size == input_rank);
1111     assert(sizeData_size == input_rank);
1112
1113     assert(beginData_base != nullptr);
1114     for (int n = 0; n < input_rank; ++n)
1115     {
1116       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1117                                                                  backend_layout)
1118                       .value();
1119
1120       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1121       starts[axis] = begin_value;
1122
1123       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1124       ends[axis] = begin_value + size_value;
1125     }
1126   }
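  // NESlice expects absolute end coordinates rather than sizes, hence ends[axis] = begin + size
  // above.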
1127
1128   ::arm_compute::Coordinates starts_set;
1129   ::arm_compute::Coordinates ends_set;
1130
1131   for (size_t i = 0; i < starts.size(); ++i)
1132   {
1133     starts_set.set(i, starts[i]);
1134     ends_set.set(i, ends[i]);
1135   }
1136
1137   auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1138       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1139
1140   _return_fn = asAclFunction(std::move(fn));
1141 }
1142
1143 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1144 {
1145   const auto output_index{node.getOutputs().at(0)};
1146   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1147   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1148   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1149   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1150
1151   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1152   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1153   const auto frontend_layout = _current_op_seq_layout;
1154   const auto backend_layout = inputData_tensor->layout();
1155
1156   // Build the starts/ends/strides index data, reordered to match the axis order of inputData
1157   int input_rank = _ctx.at(input_index).shape().rank();
1158   std::vector<int32_t> starts;
1159   std::vector<int32_t> ends;
1160   std::vector<int32_t> strides;
1161   starts.resize(input_rank, 0);
1162   ends.resize(input_rank, 0);
1163   strides.resize(input_rank, 0);
1164   {
1165     auto startData_base = _ctx.at(starts_index).data()->base();
1166     auto endData_base = _ctx.at(ends_index).data()->base();
1167     auto stridesData_base = _ctx.at(strides_index).data()->base();
1168     const int startData_size = _ctx.at(starts_index).shape().num_elements();
1169     const int endData_size = _ctx.at(ends_index).shape().num_elements();
1170     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1171
1172     using ir::DataType;
1173
1174     UNUSED_RELEASE(startData_size);
1175     UNUSED_RELEASE(endData_size);
1176     UNUSED_RELEASE(stridesData_size);
1177
1178     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1179     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1180     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1181     assert(startData_size == input_rank);
1182     assert(endData_size == input_rank);
1183     assert(stridesData_size == input_rank);
1184
1185     assert(startData_base != nullptr);
1186     for (int n = 0; n < input_rank; ++n)
1187     {
1188       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1189                                                                  backend_layout)
1190                       .value();
1191
1192       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1193       starts[axis] = start_value;
1194
1195       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1196       ends[axis] = end_value;
1197
1198       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1199       strides[axis] = strides_value;
1200     }
1201   }
1202
1203   // Reorder the mask bits to match the axis order of inputData
1204   // FIXME Take the layouts into account.
1205   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1206   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1207   const auto shrink_axis_mask =
1208       acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
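  // ReorderBits mirrors each mask into ACL's reversed axis order; as the FIXME above notes, a
  // layout permutation is not applied yet.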
1209
1210   ::arm_compute::Coordinates starts_set;
1211   ::arm_compute::Coordinates ends_set;
1212   ::arm_compute::BiStrides strides_set;
1213
1214   for (size_t i = 0; i < starts.size(); ++i)
1215   {
1216     starts_set.set(i, starts[i]);
1217     ends_set.set(i, ends[i]);
1218     strides_set.set(i, strides[i]);
1219   }
1220
1221   // Disable applied dim_correction
1222   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
1223   {
1224     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1225     acl_common::disableDimCorrection(inputData_tensor);
1226   }
1227
1228   auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1229       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1230       begin_mask, end_mask, shrink_axis_mask);
1231
1232   // Revert disabling applied dim_correction
1233   if (inputData_tensor->dimension(0) == 1)
1234   {
1235     acl_common::enableDimCorrection(inputData_tensor);
1236   }
1237
1238   _return_fn = asAclFunction(std::move(fn));
1239 }
1240
1241 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1242 {
1243   const auto ofm_index{node.getOutputs().at(0)};
1244   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1245   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1246
1247   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
1248   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
1249   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);
1250
1251   const auto stride = node.param().stride;
1252
1253   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1254          (node.param().padding.type == ir::PaddingType::VALID));
1255   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1256                                       ker_shape.W, ker_shape.H);
1257
1258   uint32_t invalid_horizontal = 0;
1259   uint32_t invalid_vertical = 0;
1260   if (node.param().padding.type == ir::PaddingType::VALID)
1261   {
1262     invalid_horizontal =
1263         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1264     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1265   }
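       // With VALID padding the requested output may be larger than the minimal
       // transposed-convolution output (ifm - 1) * stride + ker; the formulas above
       // reduce to ofm - ((ifm - 1) * stride + ker), and this surplus is forwarded to
       // NETransposeConvLayer as the trailing invalid-border arguments.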
1266
1267   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1268   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1269   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1270
1271   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1272
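       // This visitor does not use a bias tensor, so a null bias is handed to the layer.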
1273   auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1274       ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1275       invalid_horizontal, invalid_vertical);
1276
1277   _return_fn = asAclFunction(std::move(fn));
1278 }
1279
1280 void KernelGenerator::visit(const ir::operation::Transpose &node)
1281 {
1282   const auto ofm_idx{node.getOutputs().at(0)};
1283   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1284   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
1285
1286   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
1287   const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
1288   const auto frontend_layout = _current_op_seq_layout;
1289   const auto backend_layout = ifm_tensor->layout();
1290   const auto rank = _ctx.at(ifm_idx).shape().rank();
1291
1292   const auto &perms = _ctx.at(perm_idx);
1293   std::vector<int32_t> pv;
1294   if (perms.shape() == ir::Shape{0})
1295   {
1296     pv.resize(rank);
1297     std::iota(pv.begin(), pv.end(), 0);
1298     std::reverse(pv.begin(), pv.end());
1299   }
1300   else
1301   {
1302     pv = _ctx.at(perm_idx).asVector<int32_t>();
1303   }
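       // An empty PERMUTATION operand (shape {0}) selects the default behavior of
       // reversing all dimensions, i.e. pv = {rank - 1, ..., 1, 0}.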
1304
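       // Rank 1 is a plain copy, rank 2 can use NETranspose directly, and higher ranks go
       // through NEPermute with the permutation vector converted to the backend axis
       // order.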
1305   std::unique_ptr<arm_compute::IFunction> fn;
1306   if (rank == 1)
1307   {
1308     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
1309   }
1310   else if (rank == 2)
1311   {
1312     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
1313     fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1314                                                              ofm_tensor->handle());
1315   }
1316   else
1317   {
1318     auto backend_pv =
1319         acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
1320
1321     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1322                                                            ofm_tensor->handle(), backend_pv);
1323   }
1324   _return_fn = asAclFunction(std::move(fn));
1325 }
1326
1327 void KernelGenerator::visit(const ir::operation::Unpack &node)
1328 {
1329   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1330   auto axis{node.param().axis};
1331
1332   const auto input_rank = _ctx.at(input_index).shape().rank();
1333
1334   std::vector<ir::OperandIndex> output_indexes;
1335   for (const auto &output_index : node.getOutputs())
1336     output_indexes.emplace_back(output_index);
1337
1338   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1339   std::vector<arm_compute::ITensor *> outputs;
1340   for (const auto &output_index : output_indexes)
1341     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1342
1343   const auto frontend_layout = _current_op_seq_layout;
1344   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
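       // Normalize a negative axis and convert it from the frontend axis order to the ARM
       // Compute axis order before handing it to NEUnstack.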
1345   if (axis < 0)
1346     axis += input_rank;
1347   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1348
1349   // Disable applied dim_correction
1350   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1351   {
1352     // This means that the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1353     acl_common::disableDimCorrection(input_tensor);
1354   }
1355
1356   auto fn =
1357       acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
1358
1359   // Revert the disabling of dim_correction applied above
1360   if (input_tensor->dimension(0) == 1)
1361   {
1362     acl_common::enableDimCorrection(input_tensor);
1363   }
1364
1365   _return_fn = asAclFunction(std::move(fn));
1366 }
1367
1368 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1369 {
1370   const auto output_index{node.getOutputs().at(0)};
1371   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1372
1373   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1374   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1375
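       // ExpandDims only inserts a size-1 dimension; the output tensor already carries
       // the expanded shape, so copying the data through NEReshapeLayer is sufficient.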
1376   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1377                                                                    output_tensor->handle());
1378
1379   _return_fn = asAclFunction(std::move(fn));
1380 }
1381
1382 void KernelGenerator::visit(const ir::operation::Comparison &node)
1383 {
1384   const auto output_index{node.getOutputs().at(0)};
1385   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1386   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1387
1388   const auto comparison_type = node.param().comparison_type;
1389
1390   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1391   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
1392   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
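       // The frontend comparison_type enumerators are assumed to match
       // arm_compute::ComparisonOperation one-to-one, which is why a direct cast is
       // sufficient below.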
1393
1394   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1395       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1396       static_cast<arm_compute::ComparisonOperation>(comparison_type));
1397
1398   _return_fn = asAclFunction(std::move(fn));
1399 }
1400
1401 void KernelGenerator::visit(const ir::operation::OneHot &node)
1402 {
1403   const auto out_idx{node.getOutputs().at(0)};
1404   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1405   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1406   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1407   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1408
1409   auto output_tensor = _tensor_reg->getAclTensor(out_idx);
1410   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
1411   auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
1412   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
1413   auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
1414
1415   const size_t output_rank = _ctx.at(out_idx).shape().rank();
1416   const auto frontend_layout = _current_op_seq_layout;
1417   const auto backend_layout = output_tensor->layout();
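       // An axis of -1 selects the last dimension of the output; the resolved axis is
       // then converted to the ARM Compute axis order.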
1418   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
1419   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
1420
1421   auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
1422       indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1423       offvalue_tensor->handle(), output_tensor->handle(), axis);
1424   _return_fn = asAclFunction(std::move(fn));
1425 }
1426
1427 } // namespace acl_neon
1428 } // namespace backend
1429 } // namespace onert